illumos-gate Wdiff usr/src/uts/common/vm/page_retire.c

Print this page

7127  remove -Wno-missing-braces from Makefile.uts

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/vm/page_retire.c
          +++ new/usr/src/uts/common/vm/page_retire.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Page Retire - Big Theory Statement.
  28   28   *
  29   29   * This file handles removing sections of faulty memory from use when the
  30   30   * user land FMA Diagnosis Engine requests that a page be removed or when
  31   31   * a CE or UE is detected by the hardware.
  32   32   *
  33   33   * In the bad old days, the kernel side of Page Retire did a lot of the work
  34   34   * on its own. Now, with the DE keeping track of errors, the kernel side is
  35   35   * rather simple minded on most platforms.
  36   36   *
  37   37   * Errors are all reflected to the DE, and after digesting the error and
  38   38   * looking at all previously reported errors, the DE decides what should
  39   39   * be done about the current error. If the DE wants a particular page to
  40   40   * be retired, then the kernel page retire code is invoked via an ioctl.
  41   41   * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
  42   42   * page retire to handle the error. Since page retire is just a simple
  43   43   * mechanism it doesn't need to differentiate between the different callers.
  44   44   *
  45   45   * The p_toxic field in the page_t is used to indicate which errors have
  46   46   * occurred and what action has been taken on a given page. Because errors are
  47   47   * reported without regard to the locked state of a page, no locks are used
  48   48   * to SET the error bits in p_toxic. However, in order to clear the error
  49   49   * bits, the page_t must be held exclusively locked.
  50   50   *
  51   51   * When page_retire() is called, it must be able to acquire locks, sleep, etc.
  52   52   * It must not be called from high-level interrupt context.
  53   53   *
  54   54   * Depending on how the requested page is being used at the time of the retire
  55   55   * request (and on the availability of sufficient system resources), the page
  56   56   * may be retired immediately, or just marked for retirement later. For
  57   57   * example, locked pages are marked, while free pages are retired. Multiple
  58   58   * requests may be made to retire the same page, although there is no need
  59   59   * to: once the p_toxic flags are set, the page will be retired as soon as it
  60   60   * can be exclusively locked.
  61   61   *
  62   62   * The retire mechanism is driven centrally out of page_unlock(). To expedite
  63   63   * the retirement of pages, further requests for SE_SHARED locks are denied
  64   64   * as long as a page retirement is pending. In addition, as long as pages are
  65   65   * pending retirement a background thread runs periodically trying to retire
  66   66   * those pages. Pages which could not be retired while the system is running
  67   67   * are scrubbed prior to rebooting to avoid latent errors on the next boot.
  68   68   *
  69   69   * UE pages without persistent errors are scrubbed and returned to service.
  70   70   * Recidivist pages, as well as FMA-directed requests for retirement, result
  71   71   * in the page being taken out of service. Once the decision is made to take
  72   72   * a page out of service, the page is cleared, hashed onto the retired_pages
  73   73   * vnode, marked as retired, and it is unlocked.  No other requesters (except
  74   74   * for unretire) are allowed to lock retired pages.
  75   75   *
  76   76   * The public routines return (sadly) 0 if they worked and a non-zero error
  77   77   * value if something went wrong. This is done for the ioctl side of the
  78   78   * world to allow errors to be reflected all the way out to user land. The
  79   79   * non-zero values are explained in comments atop each function.
  80   80   */
  81   81  
  82   82  /*
  83   83   * Things to fix:
  84   84   *
  85   85   *      1. Trying to retire non-relocatable kvp pages may result in a
  86   86   *      quagmire. This is because seg_kmem() no longer keeps its pages locked,
  87   87   *      and calls page_lookup() in the free path; since kvp pages are modified
  88   88   *      and don't have a usable backing store, page_retire() can't do anything
  89   89   *      with them, and we'll keep denying the lock to seg_kmem_free() in a
  90   90   *      vicious cycle. To prevent that, we don't deny locks to kvp pages, and
  91   91   *      hence only try to retire a page from page_unlock() in the free path.
  92   92   *      Since most kernel pages are indefinitely held anyway, and don't
  93   93   *      participate in I/O, this is of little consequence.
  94   94   *
  95   95   *      2. Low memory situations will be interesting. If we don't have
  96   96   *      enough memory for page_relocate() to succeed, we won't be able to
  97   97   *      retire dirty pages; nobody will be able to push them out to disk
  98   98   *      either, since we aggressively deny the page lock. We could change
  99   99   *      fsflush so it can recognize this situation, grab the lock, and push
 100  100   *      the page out, where we'll catch it in the free path and retire it.
 101  101   *
 102  102   *      3. Beware of places that have code like this in them:
 103  103   *
 104  104   *              if (! page_tryupgrade(pp)) {
 105  105   *                      page_unlock(pp);
 106  106   *                      while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 107  107   *                              / *NOTHING* /
 108  108   *                      }
 109  109   *              }
 110  110   *              page_free(pp);
 111  111   *
 112  112   *      The problem is that pp can change identity right after the
 113  113   *      page_unlock() call.  In particular, page_retire() can step in
 114  114   *      there, change pp's identity, and hash pp onto the retired_vnode.
 115  115   *
 116  116   *      Of course, other functions besides page_retire() can have the
 117  117   *      same effect. A kmem reader can waltz by, set up a mapping to the
 118  118   *      page, and then unlock the page. Page_free() will then go castors
 119  119   *      up. So if anybody is doing this, it's already a bug.
 120  120   *
 121  121   *      4. mdboot()'s call into page_retire_mdboot() should probably be
 122  122   *      moved lower. Where the call is made now, we can get into trouble
 123  123   *      by scrubbing a kernel page that is then accessed later.
 124  124   */
 125  125  
 126  126  #include <sys/types.h>
 127  127  #include <sys/param.h>
 128  128  #include <sys/systm.h>
 129  129  #include <sys/mman.h>
 130  130  #include <sys/vnode.h>
 131  131  #include <sys/vfs_opreg.h>
 132  132  #include <sys/cmn_err.h>
 133  133  #include <sys/ksynch.h>
 134  134  #include <sys/thread.h>
 135  135  #include <sys/disp.h>
 136  136  #include <sys/ontrap.h>
 137  137  #include <sys/vmsystm.h>
 138  138  #include <sys/mem_config.h>
 139  139  #include <sys/atomic.h>
 140  140  #include <sys/callb.h>
 141  141  #include <sys/kobj.h>
 142  142  #include <vm/page.h>
 143  143  #include <vm/vm_dep.h>
 144  144  #include <vm/as.h>
 145  145  #include <vm/hat.h>
 146  146  #include <vm/seg_kmem.h>
 147  147  
 148  148  /*
 149  149   * vnode for all pages which are retired from the VM system;
 150  150   */
           /*
            * page_retire_destroy() hashes each retired page onto this vnode, using
            * the address of the page_t as the offset.
            */
 151  151  vnode_t *retired_pages;
 152  152  
           /*
            * Forward declaration.
            * NOTE(review): the definition is not in this part of the file; the
            * (page_t *, void *, uint_t) shape suggests a callback -- confirm
            * against the place where it is registered.
            */
 153  153  static int page_retire_pp_finish(page_t *, void *, uint_t);
 154  154  
 155  155  /*
 156  156   * Make a list of all of the pages that have been marked for retirement
 157  157   * but are not yet retired.  At system shutdown, we will scrub all of the
 158  158   * pages in the list in case there are outstanding UEs.  Then, we
 159  159   * cross-check this list against the number of pages that are yet to be
 160  160   * retired, and if we find inconsistencies, we scan every page_t in the
 161  161   * whole system looking for any pages that need to be scrubbed for UEs.
 162  162   * The background thread also uses this queue to determine which pages
 163  163   * it should keep trying to retire.
 164  164   */
           /*
            * Capacity of the pending queue: 32 entries on DEBUG builds, 256
            * otherwise.
            */
 165  165  #ifdef  DEBUG
 166  166  #define PR_PENDING_QMAX 32
 167  167  #else   /* DEBUG */
 168  168  #define PR_PENDING_QMAX 256
 169  169  #endif  /* DEBUG */
           /* Fixed-size array of pointers to pages awaiting retirement. */
 170  170  page_t          *pr_pending_q[PR_PENDING_QMAX];
           /*
            * NOTE(review): presumably serializes access to pr_pending_q -- confirm
            * against the enqueue/dequeue code, which is not in this part of the file.
            */
 171  171  kmutex_t        pr_q_mutex;
 172  172  
 173  173  /*
 174  174   * Page retire global kstats
 175  175   */
           /*
            * One named kstat per counter.  The page_retire_kstat instance is
            * initialized positionally, so the member order here must match the
            * order of the name strings in that initializer.
            */
 176  176  struct page_retire_kstat {
 177  177          kstat_named_t   pr_retired;
 178  178          kstat_named_t   pr_requested;
 179  179          kstat_named_t   pr_requested_free;
 180  180          kstat_named_t   pr_enqueue_fail;
 181  181          kstat_named_t   pr_dequeue_fail;
 182  182          kstat_named_t   pr_pending;
 183  183          kstat_named_t   pr_pending_kas;
 184  184          kstat_named_t   pr_failed;
 185  185          kstat_named_t   pr_failed_kernel;
 186  186          kstat_named_t   pr_limit;
 187  187          kstat_named_t   pr_limit_exceeded;
 188  188          kstat_named_t   pr_fma;
 189  189          kstat_named_t   pr_mce;
 190  190          kstat_named_t   pr_ue;
 191  191          kstat_named_t   pr_ue_cleared_retire;
 192  192          kstat_named_t   pr_ue_cleared_free;
 193  193          kstat_named_t   pr_ue_persistent;
 194  194          kstat_named_t   pr_unretired;
 195  195  };
 196  196  
           /*
            * The single instance of the counters.  Entries are positional: entry N
            * names member N of struct page_retire_kstat, and every counter is a
            * 64-bit unsigned value.  Several exported names differ from the member
            * names: pr_enqueue_fail/pr_dequeue_fail are exported as
            * "pages_notenqueued"/"pages_notdequeued", pr_failed/pr_failed_kernel as
            * "pages_deferred"/"pages_deferred_kernel", and pr_mce as
            * "pages_multiple_ce".
            */
 197  197  static struct page_retire_kstat page_retire_kstat = {
 198  198          { "pages_retired",              KSTAT_DATA_UINT64},
 199  199          { "pages_retire_request",       KSTAT_DATA_UINT64},
 200  200          { "pages_retire_request_free",  KSTAT_DATA_UINT64},
 201  201          { "pages_notenqueued",          KSTAT_DATA_UINT64},
 202  202          { "pages_notdequeued",          KSTAT_DATA_UINT64},
 203  203          { "pages_pending",              KSTAT_DATA_UINT64},
 204  204          { "pages_pending_kas",          KSTAT_DATA_UINT64},
 205  205          { "pages_deferred",             KSTAT_DATA_UINT64},
 206  206          { "pages_deferred_kernel",      KSTAT_DATA_UINT64},
 207  207          { "pages_limit",                KSTAT_DATA_UINT64},
 208  208          { "pages_limit_exceeded",       KSTAT_DATA_UINT64},
 209  209          { "pages_fma",                  KSTAT_DATA_UINT64},
 210  210          { "pages_multiple_ce",          KSTAT_DATA_UINT64},
 211  211          { "pages_ue",                   KSTAT_DATA_UINT64},
 212  212          { "pages_ue_cleared_retired",   KSTAT_DATA_UINT64},
 213  213          { "pages_ue_cleared_freed",     KSTAT_DATA_UINT64},
 214  214          { "pages_ue_persistent",        KSTAT_DATA_UINT64},
 215  215          { "pages_unretired",            KSTAT_DATA_UINT64},
 216  216  };
 217  217  
           /*
            * NOTE(review): presumably the installed kstat handle for the counters
            * above; it is only initialized to NULL in this part of the file.
            */
 218  218  static kstat_t  *page_retire_ksp = NULL;
 219  219  
           /*
            * Atomically bump a counter up or down.  "stat" is the bare member name
            * of struct page_retire_kstat, e.g. PR_INCR_KSTAT(pr_retired).
            */
 220  220  #define PR_INCR_KSTAT(stat)     \
 221  221          atomic_inc_64(&(page_retire_kstat.stat.value.ui64))
 222  222  #define PR_DECR_KSTAT(stat)     \
 223  223          atomic_dec_64(&(page_retire_kstat.stat.value.ui64))
 224  224  
           /*
            * Plain (non-atomic) accessors for the raw 64-bit counter values.
            * PR_KSTAT_RETIRED_NOTUE is the sum of the multiple-CE and FMA
            * retirement counts; page_retire_limit() compares it against
            * PAGE_RETIRE_LIMIT.
            */
 225  225  #define PR_KSTAT_RETIRED_CE     (page_retire_kstat.pr_mce.value.ui64)
 226  226  #define PR_KSTAT_RETIRED_FMA    (page_retire_kstat.pr_fma.value.ui64)
 227  227  #define PR_KSTAT_RETIRED_NOTUE  (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
 228  228  #define PR_KSTAT_PENDING        (page_retire_kstat.pr_pending.value.ui64)
 229  229  #define PR_KSTAT_PENDING_KAS    (page_retire_kstat.pr_pending_kas.value.ui64)
 230  230  #define PR_KSTAT_EQFAIL         (page_retire_kstat.pr_enqueue_fail.value.ui64)
 231  231  #define PR_KSTAT_DQFAIL         (page_retire_kstat.pr_dequeue_fail.value.ui64)
 232  232  
 233  233  /*
 234  234   * page retire kstats to list all retired pages
 235  235   */
           /*
            * NOTE(review): by their names and signatures these look like the
            * update and snapshot callbacks of a kstat that lists retired pages;
            * the definitions and the kstat setup are not in this part of the
            * file -- confirm there.
            */
 236  236  static int pr_list_kstat_update(kstat_t *ksp, int rw);
 237  237  static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
           /* NOTE(review): presumably the lock for that kstat -- confirm at its use. */
 238  238  kmutex_t pr_list_kstat_mutex;
 239  239  
 240  240  /*
 241  241   * Limit the number of multiple CE page retires.
 242  242   * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 243  243   * basis points, where 100 basis points equals one percent.
 244  244   */
           /* Default limit: 10 basis points. */
 245  245  #define MCE_BPT 10
 246  246  uint64_t        max_pages_retired_bps = MCE_BPT;
           /*
            * Limit in pages.  Re-evaluated from the current physmem and
            * max_pages_retired_bps at every use; the division is integer division,
            * so the result is rounded down.
            */
 247  247  #define PAGE_RETIRE_LIMIT       ((physmem * max_pages_retired_bps) / 10000)
 248  248  
 249  249  /*
 250  250   * Control over the verbosity of page retirement.
 251  251   *
 252  252   * When set to zero (the default), no messages will be printed.
 253  253   * When set to one, summary messages will be printed.
 254  254   * When set > one, all messages will be printed.
 255  255   *
 256  256   * A value of one will trigger detailed messages for retirement operations,
 257  257   * and is intended as a platform tunable for processors where FMA's DE does
 258  258   * not run (e.g., spitfire). Values > one are intended for debugging only.
 259  259   */
           /*
            * Compared by PR_MESSAGE against each message's level: a message is
            * printed when page_retire_messages >= that level.
            */
 260  260  int page_retire_messages = 0;
 261  261  
 262  262  /*
 263  263   * Control whether or not we return scrubbed UE pages to service.
 264  264   * By default we do not since FMA wants to run its diagnostics first
 265  265   * and then ask us to unretire the page if it passes. Non-FMA platforms
 266  266   * may set this to zero so we will only retire recidivist pages. It should
 267  267   * not be changed by the user.
 268  268   */
           /* Read by page_retire_transient_ue() after a UE was scrubbed successfully. */
 269  269  int page_retire_first_ue = 1;
 270  270  
 271  271  /*
 272  272   * Master enable for page retire. This prevents a CE or UE early in boot
 273  273   * from trying to retire a page before page_retire_init() has finished
 274  274   * setting things up. This is internal only and is not a tunable!
 275  275   */
 276  276  static int pr_enable = 0;
 277  277  
           /*
            * NOTE(review): callback taking a single uint64_t, presumably supplied
            * by the memory scrubber; where it is set and called is not in this part
            * of the file -- confirm there.
            */
 278  278  static void (*memscrub_notify_func)(uint64_t);
 279  279  
 280  280  #ifdef  DEBUG
           /*
            * DEBUG-only event counters, one int per instrumented code path.  They
            * are bumped through PR_DEBUG().
            */
 281  281  struct page_retire_debug {
 282  282          int prd_dup1;
 283  283          int prd_dup2;
 284  284          int prd_qdup;
 285  285          int prd_noaction;
 286  286          int prd_queued;
 287  287          int prd_notqueued;
 288  288          int prd_dequeue;
 289  289          int prd_top;
 290  290          int prd_locked;
 291  291          int prd_reloc;
 292  292          int prd_relocfail;
 293  293          int prd_mod;
 294  294          int prd_mod_late;
 295  295          int prd_kern;
 296  296          int prd_free;
 297  297          int prd_noreclaim;
 298  298          int prd_hashout;
 299  299          int prd_fma;
 300  300          int prd_uescrubbed;
 301  301          int prd_uenotscrubbed;
 302  302          int prd_mce;
 303  303          int prd_prlocked;
 304  304          int prd_prnotlocked;
 305  305          int prd_prretired;
 306  306          int prd_ulocked;
 307  307          int prd_unotretired;
 308  308          int prd_udestroy;
 309  309          int prd_uhashout;
 310  310          int prd_uunretired;
 311  311          int prd_unotlocked;
 312  312          int prd_checkhit;
 313  313          int prd_checkmiss_pend;
 314  314          int prd_checkmiss_noerr;
 315  315          int prd_tctop;
 316  316          int prd_tclocked;
 317  317          int prd_hunt;
 318  318          int prd_dohunt;
 319  319          int prd_earlyhunt;
 320  320          int prd_latehunt;
 321  321          int prd_nofreedemote;
 322  322          int prd_nodemote;
 323  323          int prd_demoted;
 324  324  } pr_debug;
 325  325  
           /*
            * Count one occurrence of event "foo" (a member name of pr_debug).  This
            * is a plain post-increment, not an atomic operation.
            */
 326  326  #define PR_DEBUG(foo)   ((pr_debug.foo)++)
 327  327  
 328  328  /*
 329  329   * A type histogram. We record the incidence of the various toxic
 330  330   * flag combinations along with the interesting page attributes. The
 331  331   * goal is to get as many combinations as we can while driving all
 332  332   * pr_debug values nonzero (indicating we've exercised all possible
 333  333   * code paths across all possible page types). Not all combinations
 334  334   * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 335  335   *
 336  336   * pr_type offset bit encoding (when examining with a debugger):
 337  337   *
 338  338   *    PRT_NAMED  - 0x4
 339  339   *    PRT_KERNEL - 0x8
 340  340   *    PRT_FREE   - 0x10
 341  341   *    PRT_MOD    - 0x20
 342  342   *    PRT_FMA    - 0x0
 343  343   *    PRT_MCE    - 0x40
 344  344   *    PRT_UE     - 0x80
 345  345   */
 346  346  
           /*
            * Histogram index bits.  These are array indices into pr_types[]; the
            * debugger offsets listed in the comment above are these values times
            * four, i.e. byte offsets assuming a 4-byte int.
            */
 347  347  #define PRT_NAMED       0x01
 348  348  #define PRT_KERNEL      0x02
 349  349  #define PRT_FREE        0x04
 350  350  #define PRT_MOD         0x08
 351  351  #define PRT_FMA         0x00    /* yes, this is not a mistake */
 352  352  #define PRT_MCE         0x10
 353  353  #define PRT_UE          0x20
 354  354  #define PRT_ALL         0x3F
 355  355  
           /* One slot per combination of the six flag bits: 0x3F + 1 = 64 slots. */
 356  356  int pr_types[PRT_ALL+1];
 357  357  
           /*
            * Classify page pp by the PRT_* bits and bump the matching pr_types[]
            * slot.  Expands to a bare { } block that declares a local "whichtype".
            * The argument is used unparenthesized and evaluated several times, so
            * pass a simple pointer variable.  PRT_FMA is zero, so an FMA request
            * contributes no bit of its own.
            */
 358  358  #define PR_TYPES(pp)    {                       \
 359  359          int whichtype = 0;                      \
 360  360          if (pp->p_vnode)                        \
 361  361                  whichtype |= PRT_NAMED;         \
 362  362          if (PP_ISKAS(pp))                       \
 363  363                  whichtype |= PRT_KERNEL;        \
 364  364          if (PP_ISFREE(pp))                      \
 365  365                  whichtype |= PRT_FREE;          \
 366  366          if (hat_ismod(pp))                      \
 367  367                  whichtype |= PRT_MOD;           \
 368  368          if (pp->p_toxic & PR_UE)                \
 369  369                  whichtype |= PRT_UE;            \
 370  370          if (pp->p_toxic & PR_MCE)               \
 371  371                  whichtype |= PRT_MCE;           \
 372  372          pr_types[whichtype]++;                  \
 373  373  }
 374  374  
           /*
            * Call counters and bit masks for MTBF(), in three pairs.
            * NOTE(review): presumably used to inject occasional failures on DEBUG
            * kernels ("mean time between failures"); the users are not in this part
            * of the file -- confirm there.
            */
 375  375  int recl_calls;
 376  376  int recl_mtbf = 3;
 377  377  int reloc_calls;
 378  378  int reloc_mtbf = 7;
 379  379  int pr_calls;
 380  380  int pr_mtbf = 15;
 381  381  
           /*
            * Pre-increments the counter v and evaluates to 0 only when every bit
            * of the mask f is set in the new count, and to 1 otherwise.  With the
            * default masks 3, 7 and 15 that is one call in 4, 8 and 16.
            */
 382  382  #define MTBF(v, f)      (((++(v)) & (f)) != (f))
 383  383  
 384  384  #else   /* DEBUG */
 385  385  
           /*
            * Non-DEBUG builds: PR_DEBUG() and PR_TYPES() expand to nothing and
            * MTBF() is the constant 1.
            */
 386  386  #define PR_DEBUG(foo)   /* nothing */
 387  387  #define PR_TYPES(foo)   /* nothing */
 388  388  #define MTBF(v, f)      (1)
 389  389  
 390  390  #endif  /* DEBUG */
 391  391  
 392  392  /*
 393  393   * page_retire_done() - completion processing
 394  394   *
 395  395   * Used by the page_retire code for common completion processing.
 396  396   * It keeps track of how many times a given result has happened,
 397  397   * and writes out an occasional message.
 398  398   *
 399  399   * May be called with a NULL pp (PRD_INVALID_PA case).
 400  400   */
           /*
            * Result codes passed to page_retire_done().  Every code except
            * PRD_INVALID_KEY has one row in page_retire_ops[]; PRD_INVALID_KEY
            * marks the end of that table.  Its expansion (-1) is not parenthesized.
            */
 401  401  #define PRD_INVALID_KEY         -1
 402  402  #define PRD_SUCCESS             0
 403  403  #define PRD_PENDING             1
 404  404  #define PRD_FAILED              2
 405  405  #define PRD_DUPLICATE           3
 406  406  #define PRD_INVALID_PA          4
 407  407  #define PRD_LIMIT               5
 408  408  #define PRD_UE_SCRUBBED         6
 409  409  #define PRD_UNR_SUCCESS         7
 410  410  #define PRD_UNR_CANTLOCK        8
 411  411  #define PRD_UNR_NOT             9
 412  412  
           /*
            * One row of the table that page_retire_done() searches by pr_key.
            * pr_count is updated at run time, so the table cannot be const.
            */
 413  413  typedef struct page_retire_op {
 414  414          int     pr_key;         /* one of the PRD_* defines from above */
 415  415          int     pr_count;       /* How many times this has happened */
 416  416          int     pr_retval;      /* return value */
 417  417          int     pr_msglvl;      /* message level - when to print */
 418  418          char    *pr_message;    /* Cryptic message for field service */
 419  419  } page_retire_op_t;
 420  420  
           /*
            * Result table for page_retire_done(), terminated by the
            * PRD_INVALID_KEY row.  Each message is a cmn_err() format taking
            * exactly two 32-bit arguments, the high and low halves of the physical
            * address, as supplied by PR_MESSAGE.  A NULL message (PRD_FAILED) makes
            * PR_MESSAGE print nothing.  PRD_LIMIT and PRD_UE_SCRUBBED return 0 to
            * the caller although the page was not taken out of service.
            */
 421  421  static page_retire_op_t page_retire_ops[] = {
 422  422          /* key                  count   retval  msglvl  message */
 423  423          {PRD_SUCCESS,           0,      0,      1,
 424  424                  "Page 0x%08x.%08x removed from service"},
 425  425          {PRD_PENDING,           0,      EAGAIN, 2,
 426  426                  "Page 0x%08x.%08x will be retired on free"},
 427  427          {PRD_FAILED,            0,      EAGAIN, 0, NULL},
 428  428          {PRD_DUPLICATE,         0,      EIO,    2,
 429  429                  "Page 0x%08x.%08x already retired or pending"},
 430  430          {PRD_INVALID_PA,        0,      EINVAL, 2,
 431  431                  "PA 0x%08x.%08x is not a relocatable page"},
 432  432          {PRD_LIMIT,             0,      0,      1,
 433  433                  "Page 0x%08x.%08x not retired due to limit exceeded"},
 434  434          {PRD_UE_SCRUBBED,       0,      0,      1,
 435  435                  "Previously reported error on page 0x%08x.%08x cleared"},
 436  436          {PRD_UNR_SUCCESS,       0,      0,      1,
 437  437                  "Page 0x%08x.%08x returned to service"},
 438  438          {PRD_UNR_CANTLOCK,      0,      EAGAIN, 2,
 439  439                  "Page 0x%08x.%08x could not be unretired"},
 440  440          {PRD_UNR_NOT,           0,      EIO,    2,
 441  441                  "Page 0x%08x.%08x is not retired"},
 442  442          {PRD_INVALID_KEY,       0,      0,      0, NULL} /* MUST BE LAST! */
 443  443  };
 444  444  
 445  445  /*
 446  446   * print a message if page_retire_messages is true.
 447  447   */
           /*
            * debuglvl - cmn_err() level (e.g. CE_NOTE, CE_WARN)
            * msglvl   - minimum value of page_retire_messages at which to print
            * msg      - cmn_err() format expecting two 32-bit values, or NULL to
            *            print nothing
            * pa       - physical address; split into its high and low 32 bits
            *
            * Expands to a bare { } block that declares a local "p", and uses its
            * arguments unparenthesized; msglvl and msg are evaluated more than once.
            */
 448  448  #define PR_MESSAGE(debuglvl, msglvl, msg, pa)                           \
 449  449  {                                                                       \
 450  450          uint64_t p = (uint64_t)pa;                                      \
 451  451          if (page_retire_messages >= msglvl && msg != NULL) {            \
 452  452                  cmn_err(debuglvl, msg,                                  \
 453  453                      (uint32_t)(p >> 32), (uint32_t)p);                  \
 454  454          }                                                               \
 455  455  }
 456  456  
 457  457  /*
 458  458   * Note that multiple bits may be set in a single settoxic operation.
 459  459   * May be called without the page locked.
 460  460   */
 461  461  void
 462  462  page_settoxic(page_t *pp, uchar_t bits)
 463  463  {
                   /* Atomic OR, so concurrent setters cannot lose each other's bits. */
 464  464          atomic_or_8(&pp->p_toxic, bits);
 465  465  }
 466  466  
 467  467  /*
 468  468   * Note that multiple bits may be cleared in a single clrtoxic operation.
 469  469   * Must be called with the page exclusively locked to prevent races which
 470  470   * may attempt to retire a page without any toxic bits set.
 471  471   * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
 472  472   * being held as there is a separate mutex which protects that bit.
 473  473   */
 474  474  void
 475  475  page_clrtoxic(page_t *pp, uchar_t bits)
 476  476  {
                   /*
                    * The exclusive lock is required unless PR_CAPTURE is among the
                    * bits being cleared.  The check is skipped whenever PR_CAPTURE is
                    * present, even if other bits are being cleared in the same call.
                    */
 477  477          ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
                   /* Atomic AND with the complement clears exactly the given bits. */
 478  478          atomic_and_8(&pp->p_toxic, ~bits);
 479  479  }
 480  480  
 481  481  /*
 482  482   * Prints any page retire messages to the user, and decides what
 483  483   * error code is appropriate for the condition reported.
            *
            * pp   - page the result applies to, or NULL (PRD_INVALID_PA case).  When
            *        non-NULL its physical address is put in the message and PR_MSG
            *        is set on the page.
            * code - one of the PRD_* keys; selects the row of page_retire_ops[].
            *
            * Returns the pr_retval of the matching row.  An unknown code panics on
            * DEBUG kernels and returns EINVAL on non-DEBUG kernels.
 484  484   */
 485  485  static int
 486  486  page_retire_done(page_t *pp, int code)
 487  487  {
 488  488          page_retire_op_t *prop;
 489  489          uint64_t        pa = 0;
 490  490          int             i;
 491  491  
                   /* With a NULL pp the message is printed with a PA of zero. */
 492  492          if (pp != NULL) {
 493  493                  pa = mmu_ptob((uint64_t)pp->p_pagenum);
 494  494          }
 495  495  
                   /* Linear search; the table ends at the PRD_INVALID_KEY row. */
 496  496          prop = NULL;
 497  497          for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
 498  498                  if (page_retire_ops[i].pr_key == code) {
 499  499                          prop = &page_retire_ops[i];
 500  500                          break;
 501  501                  }
 502  502          }
 503  503  
 504  504  #ifdef  DEBUG
 505  505          if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
 506  506                  cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
 507  507          }
 508  508  #endif
 509  509  
                   /*
                    * ASSERT() compiles away on non-DEBUG kernels, so without this
                    * check an unknown code would dereference a NULL prop below.
                    */
                   if (prop == NULL) {
                           return (EINVAL);
                   }
           
 510  510          ASSERT(prop->pr_key == code);
 511  511  
                   /* Plain increment; this counter is not protected against races. */
 512  512          prop->pr_count++;
 513  513  
 514  514          PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
                   /* PR_MSG is set on the page whether or not a message was printed. */
 515  515          if (pp != NULL) {
 516  516                  page_settoxic(pp, PR_MSG);
 517  517          }
 518  518  
 519  519          return (prop->pr_retval);
 520  520  }
 521  521  
 522  522  /*
 523  523   * Act like page_destroy(), but instead of freeing the page, hash it onto
 524  524   * the retired_pages vnode, and mark it retired.
 525  525   *
 526  526   * For fun, we try to scrub the page until it's squeaky clean.
 527  527   * availrmem is adjusted here.
 528  528   */
 529  529  static void
 530  530  page_retire_destroy(page_t *pp)
 531  531  {
                   /* The page_t's own address serves as its offset on retired_pages. */
 532  532          u_offset_t off = (u_offset_t)((uintptr_t)pp);
 533  533  
                   /*
                    * Preconditions: the caller holds pp exclusively, and the page is
                    * not free, has p_szc == 0, has no mappings and has no vnode.
                    */
 534  534          ASSERT(PAGE_EXCL(pp));
 535  535          ASSERT(!PP_ISFREE(pp));
 536  536          ASSERT(pp->p_szc == 0);
 537  537          ASSERT(!hat_page_is_mapped(pp));
 538  538          ASSERT(!pp->p_vnode);
 539  539  
 540  540          page_clr_all_props(pp);
 541  541          pagescrub(pp, 0, MMU_PAGESIZE);
 542  542  
                   /* Unlink the page before hashing it onto the retired_pages vnode. */
 543  543          pp->p_next = NULL;
 544  544          pp->p_prev = NULL;
                   /* A zero return from page_hashin() is treated as fatal. */
 545  545          if (page_hashin(pp, retired_pages, off, NULL) == 0) {
 546  546                  cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
 547  547          }
 548  548  
 549  549          page_settoxic(pp, PR_RETIRED);
 550  550          PR_INCR_KSTAT(pr_retired);
 551  551  
                   /*
                    * Attribute the retirement to exactly one cause, checked in the
                    * order FMA, then UE; anything else is counted as multiple-CE.
                    */
 552  552          if (pp->p_toxic & PR_FMA) {
 553  553                  PR_INCR_KSTAT(pr_fma);
 554  554          } else if (pp->p_toxic & PR_UE) {
 555  555                  PR_INCR_KSTAT(pr_ue);
 556  556          } else {
 557  557                  PR_INCR_KSTAT(pr_mce);
 558  558          }
 559  559  
                   /* availrmem is adjusted under freemem_lock. */
 560  560          mutex_enter(&freemem_lock);
 561  561          availrmem--;
 562  562          mutex_exit(&freemem_lock);
 563  563  
                   /* The page is unlocked on return; the caller must not unlock it. */
 564  564          page_unlock(pp);
 565  565  }
 566  566  
 567  567  /*
 568  568   * Check whether the number of pages which have been retired already exceeds
 569  569   * the maximum allowable percentage of memory which may be retired.
 570  570   *
 571  571   * Returns 1 if the limit has been exceeded.
 572  572   */
 573  573  static int
 574  574  page_retire_limit(void)
 575  575  {
                   /*
                    * Only multiple-CE and FMA retirements count toward the limit
                    * (PR_KSTAT_RETIRED_NOTUE); UE retirements are not included.
                    * Reaching the limit exactly already counts as exceeding it, and
                    * each such call also bumps the pr_limit_exceeded counter.
                    */
 576  576          if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
 577  577                  PR_INCR_KSTAT(pr_limit_exceeded);
 578  578                  return (1);
 579  579          }
 580  580  
 581  581          return (0);
 582  582  }
 583  583  
           /*
            * Warning formats used by page_clear_transient_ue().
            *
            * MSG_DM takes four arguments: the high and low 32 bits of the physical
            * address, then the byte read back and the byte written.
            * NOTE(review): the first two string pieces join without a space, so
            * the output reads "...%08x[ 0x..." -- confirm whether that is intended.
            */
 584  584  #define MSG_DM  "Data Mismatch occurred at PA 0x%08x.%08x"              \
 585  585          "[ 0x%x != 0x%x ] while attempting to clear previously "        \
 586  586          "reported error; page removed from service"
 587  587  
           /* MSG_UE takes two arguments: the high and low 32 bits of the address. */
 588  588  #define MSG_UE  "Uncorrectable Error occurred at PA 0x%08x.%08x while " \
 589  589          "attempting to clear previously reported error; page removed "  \
 590  590          "from service"
 591  591  
 592  592  /*
 593  593   * Attempt to clear a UE from a page.
 594  594   * Returns 1 if the error has been successfully cleared.
 595  595   */
 596  596  static int
 597  597  page_clear_transient_ue(page_t *pp)
 598  598  {
 599  599          caddr_t         kaddr;
 600  600          uint8_t         rb, wb;
 601  601          uint64_t        pa;
 602  602          uint32_t        pa_hi, pa_lo;
 603  603          on_trap_data_t  otd;
 604  604          int             errors = 0;
 605  605          int             i;
 606  606  
                   /*
                    * Preconditions: the caller holds pp exclusively, PP_PR_REQ(pp)
                    * is true, p_szc is 0 and the page has no mappings.
                    */
 607  607          ASSERT(PAGE_EXCL(pp));
 608  608          ASSERT(PP_PR_REQ(pp));
 609  609          ASSERT(pp->p_szc == 0);
 610  610          ASSERT(!hat_page_is_mapped(pp));
 611  611  
 612  612          /*
 613  613           * Clear the page and attempt to clear the UE.  If we trap
 614  614           * on the next access to the page, we know the UE has recurred.
 615  615           */
 616  616          pagescrub(pp, 0, PAGESIZE);
 617  617  
 618  618          /*
 619  619           * Map the page and write a bunch of bit patterns to compare
 620  620           * what we wrote with what we read back.  This isn't a perfect
 621  621           * test but it should be good enough to catch most of the
 622  622           * recurring UEs. If this fails to catch a recurrent UE, we'll
 623  623           * retire the page the next time we see a UE on the page.
 624  624           */
 625  625          kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);
 626  626  
                   /* pa_hi and pa_lo are only used by the MSG_DM warning below. */
 627  627          pa = ptob((uint64_t)page_pptonum(pp));
 628  628          pa_hi = (uint32_t)(pa >> 32);
 629  629          pa_lo = (uint32_t)pa;
 630  630  
 631  631          /*
 632  632           * Disable preemption to prevent the off chance that
 633  633           * we migrate while in the middle of running through
 634  634           * the bit pattern and run on a different processor
 635  635           * than what we started on.
 636  636           */
 637  637          kpreempt_disable();
 638  638  
 639  639          /*
 640  640           * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
 641  641           * the cache in between reading and writing.  We do this under
 642  642           * on_trap() protection to avoid recursion.
 643  643           */
                   /*
                    * NOTE(review): the nonzero branch is presumably entered when a
                    * trap fires during the pattern loop (setjmp-like behavior) --
                    * confirm against the on_trap() documentation.  The UE message is
                    * printed at message level 1.
                    */
 644  644          if (on_trap(&otd, OT_DATA_EC)) {
 645  645                  PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
 646  646                  errors = 1;
 647  647          } else {
                           /* wb counts down from 0xff to 0x01; 0x00 is not written. */
 648  648                  for (wb = 0xff; wb > 0; wb--) {
 649  649                          for (i = 0; i < PAGESIZE; i++) {
 650  650                                  kaddr[i] = wb;
 651  651                          }
 652  652  
 653  653                          sync_data_memory(kaddr, PAGESIZE);
 654  654  
 655  655                          for (i = 0; i < PAGESIZE; i++) {
 656  656                                  rb = kaddr[i];
 657  657                                  if (rb != wb) {
 658  658                                          /*
 659  659                                           * We had a mismatch without a trap.
 660  660                                           * Uh-oh. Something is really wrong
 661  661                                           * with this system.
 662  662                                           */
                                                   /*
                                                    * Unlike the UE message, this one
                                                    * is printed for any nonzero
                                                    * page_retire_messages.
                                                    */
 663  663                                          if (page_retire_messages) {
 664  664                                                  cmn_err(CE_WARN, MSG_DM,
 665  665                                                      pa_hi, pa_lo, rb, wb);
 666  666                                          }
 667  667                                          errors = 1;
 668  668                                          goto out;       /* double break */
 669  669                                  }
 670  670                          }
 671  671                  }
 672  672          }
                   /*
                    * The trap, mismatch and success paths all arrive here; cleanup
                    * runs in the reverse order of the setup above.
                    */
 673  673  out:
 674  674          no_trap();
 675  675          kpreempt_enable();
 676  676          ppmapout(kaddr);
 677  677  
                   /* errors is 0 or 1; invert it so that 1 means the UE was cleared. */
 678  678          return (errors ? 0 : 1);
 679  679  }
 680  680  
 681  681  /*
 682  682   * Try to clear a page_t with a single UE. If the UE was transient, it is
 683  683   * returned to service, and we return 1. Otherwise we return 0 meaning
 684  684   * that further processing is required to retire the page.
 685  685   */
 686  686  static int
 687  687  page_retire_transient_ue(page_t *pp)
 688  688  {
 689  689          ASSERT(PAGE_EXCL(pp));
 690  690          ASSERT(!hat_page_is_mapped(pp));
 691  691  
 692  692          /*
 693  693           * If this page is a repeat offender, retire him under the
 694  694           * "two strikes and you're out" rule. The caller is responsible
 695  695           * for scrubbing the page to try to clear the error.
 696  696           */
 697  697          if (pp->p_toxic & PR_UE_SCRUBBED) {
 698  698                  PR_INCR_KSTAT(pr_ue_persistent);
 699  699                  return (0);
 700  700          }
 701  701  
 702  702          if (page_clear_transient_ue(pp)) {
 703  703                  /*
 704  704                   * We set the PR_SCRUBBED_UE bit; if we ever see this
 705  705                   * page again, we will retire it, no questions asked.
 706  706                   */
 707  707                  page_settoxic(pp, PR_UE_SCRUBBED);
 708  708  
 709  709                  if (page_retire_first_ue) {
 710  710                          PR_INCR_KSTAT(pr_ue_cleared_retire);
 711  711                          return (0);
 712  712                  } else {
 713  713                          PR_INCR_KSTAT(pr_ue_cleared_free);
 714  714  
 715  715                          page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);
 716  716  
 717  717                          /* LINTED: CONSTCOND */
 718  718                          VN_DISPOSE(pp, B_FREE, 1, kcred);
 719  719                          return (1);
 720  720                  }
 721  721          }
 722  722  
 723  723          PR_INCR_KSTAT(pr_ue_persistent);
 724  724          return (0);
 725  725  }
 726  726  
 727  727  /*
 728  728   * Update the statistics dynamically when our kstat is read.
 729  729   */
 730  730  static int
 731  731  page_retire_kstat_update(kstat_t *ksp, int rw)
 732  732  {
 733  733          struct page_retire_kstat *pr;
 734  734  
 735  735          if (ksp == NULL)
 736  736                  return (EINVAL);
 737  737  
 738  738          switch (rw) {
 739  739  
 740  740          case KSTAT_READ:
 741  741                  pr = (struct page_retire_kstat *)ksp->ks_data;
 742  742                  ASSERT(pr == &page_retire_kstat);
 743  743                  pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
 744  744                  return (0);
 745  745  
 746  746          case KSTAT_WRITE:
 747  747                  return (EACCES);
 748  748  
 749  749          default:
 750  750                  return (EINVAL);
 751  751          }
 752  752          /*NOTREACHED*/
 753  753  }
 754  754  
 755  755  static int
 756  756  pr_list_kstat_update(kstat_t *ksp, int rw)
 757  757  {
 758  758          uint_t count;
 759  759          page_t *pp;
 760  760          kmutex_t *vphm;
 761  761  
 762  762          if (rw == KSTAT_WRITE)
 763  763                  return (EACCES);
 764  764  
 765  765          vphm = page_vnode_mutex(retired_pages);
 766  766          mutex_enter(vphm);
 767  767          /* Needs to be under a lock so that for loop will work right */
 768  768          if (retired_pages->v_pages == NULL) {
 769  769                  mutex_exit(vphm);
 770  770                  ksp->ks_ndata = 0;
 771  771                  ksp->ks_data_size = 0;
 772  772                  return (0);
 773  773          }
 774  774  
 775  775          count = 1;
 776  776          for (pp = retired_pages->v_pages->p_vpnext;
 777  777              pp != retired_pages->v_pages; pp = pp->p_vpnext) {
 778  778                  count++;
 779  779          }
 780  780          mutex_exit(vphm);
 781  781  
 782  782          ksp->ks_ndata = count;
 783  783          ksp->ks_data_size = count * 2 * sizeof (uint64_t);
 784  784  
 785  785          return (0);
 786  786  }
 787  787  
 788  788  /*
 789  789   * all spans will be pagesize and no coalescing will be done with the
 790  790   * list produced.
 791  791   */
 792  792  static int
 793  793  pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
 794  794  {
 795  795          kmutex_t *vphm;
 796  796          page_t *pp;
 797  797          struct memunit {
 798  798                  uint64_t address;
 799  799                  uint64_t size;
 800  800          } *kspmem;
 801  801  
 802  802          if (rw == KSTAT_WRITE)
 803  803                  return (EACCES);
 804  804  
 805  805          ksp->ks_snaptime = gethrtime();
 806  806  
 807  807          kspmem = (struct memunit *)buf;
 808  808  
 809  809          vphm = page_vnode_mutex(retired_pages);
 810  810          mutex_enter(vphm);
 811  811          pp = retired_pages->v_pages;
 812  812          if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
 813  813              (pp == NULL)) {
 814  814                  mutex_exit(vphm);
 815  815                  return (0);
 816  816          }
 817  817          kspmem->address = ptob(pp->p_pagenum);
 818  818          kspmem->size = PAGESIZE;
 819  819          kspmem++;
 820  820          for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
 821  821              pp = pp->p_vpnext, kspmem++) {
 822  822                  if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
 823  823                          break;
 824  824                  kspmem->address = ptob(pp->p_pagenum);
 825  825                  kspmem->size = PAGESIZE;
 826  826          }
 827  827          mutex_exit(vphm);
 828  828  
 829  829          return (0);
 830  830  }
 831  831  
 832  832  /*
 833  833   * page_retire_pend_count -- helper function for page_capture_thread,
 834  834   * returns the number of pages pending retirement.
 835  835   */
 836  836  uint64_t
 837  837  page_retire_pend_count(void)
 838  838  {
 839  839          return (PR_KSTAT_PENDING);
 840  840  }
 841  841  
 842  842  uint64_t
 843  843  page_retire_pend_kas_count(void)
 844  844  {
 845  845          return (PR_KSTAT_PENDING_KAS);
 846  846  }
 847  847  
 848  848  void
 849  849  page_retire_incr_pend_count(void *datap)
 850  850  {
 851  851          PR_INCR_KSTAT(pr_pending);
 852  852  
 853  853          if ((datap == &kvp) || (datap == &zvp)) {
 854  854                  PR_INCR_KSTAT(pr_pending_kas);
 855  855          }
 856  856  }
 857  857  
 858  858  void
 859  859  page_retire_decr_pend_count(void *datap)
 860  860  {
 861  861          PR_DECR_KSTAT(pr_pending);
 862  862  
 863  863          if ((datap == &kvp) || (datap == &zvp)) {
 864  864                  PR_DECR_KSTAT(pr_pending_kas);
 865  865          }
 866  866  }
 867  867  
 868  868  /*
 869  869   * Initialize the page retire mechanism:
 870  870   *
 871  871   *   - Establish the correctable error retire limit.

↓ open down ↓

871 lines elided

↑ open up ↑

 872  872   *   - Initialize locks.
 873  873   *   - Build the retired_pages vnode.
 874  874   *   - Set up the kstats.
 875  875   *   - Fire off the background thread.
 876  876   *   - Tell page_retire() it's OK to start retiring pages.
 877  877   */
 878  878  void
 879  879  page_retire_init(void)
 880  880  {
 881  881          const fs_operation_def_t retired_vnodeops_template[] = {
 882      -                { NULL, NULL }
      882 +                { NULL, {NULL} }
 883  883          };
 884  884          struct vnodeops *vops;
 885  885          kstat_t *ksp;
 886  886  
 887  887          const uint_t page_retire_ndata =
 888  888              sizeof (page_retire_kstat) / sizeof (kstat_named_t);
 889  889  
 890  890          ASSERT(page_retire_ksp == NULL);
 891  891  
 892  892          if (max_pages_retired_bps <= 0) {

 893  893                  max_pages_retired_bps = MCE_BPT;
 894  894          }
 895  895  
 896  896          mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);
 897  897  
 898  898          retired_pages = vn_alloc(KM_SLEEP);
 899  899          if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
 900  900                  cmn_err(CE_PANIC,
 901  901                      "page_retired_init: can't make retired vnodeops");
 902  902          }
 903  903          vn_setops(retired_pages, vops);
 904  904  
 905  905          if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
 906  906              "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
 907  907              KSTAT_FLAG_VIRTUAL)) == NULL) {
 908  908                  cmn_err(CE_WARN, "kstat_create for page_retire failed");
 909  909          } else {
 910  910                  page_retire_ksp->ks_data = (void *)&page_retire_kstat;
 911  911                  page_retire_ksp->ks_update = page_retire_kstat_update;
 912  912                  kstat_install(page_retire_ksp);
 913  913          }
 914  914  
 915  915          mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
 916  916          ksp = kstat_create("unix", 0, "page_retire_list", "misc",
 917  917              KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
 918  918          if (ksp != NULL) {
 919  919                  ksp->ks_update = pr_list_kstat_update;
 920  920                  ksp->ks_snapshot = pr_list_kstat_snapshot;
 921  921                  ksp->ks_lock = &pr_list_kstat_mutex;
 922  922                  kstat_install(ksp);
 923  923          }
 924  924  
 925  925          memscrub_notify_func =
 926  926              (void(*)(uint64_t))kobj_getsymvalue("memscrub_notify", 0);
 927  927  
 928  928          page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
 929  929          pr_enable = 1;
 930  930  }
 931  931  
 932  932  /*
 933  933   * page_retire_hunt() callback for the retire thread.
 934  934   */
 935  935  static void
 936  936  page_retire_thread_cb(page_t *pp)
 937  937  {
 938  938          PR_DEBUG(prd_tctop);
 939  939          if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
 940  940                  PR_DEBUG(prd_tclocked);
 941  941                  page_unlock(pp);
 942  942          }
 943  943  }
 944  944  
 945  945  /*
 946  946   * Callback used by page_trycapture() to finish off retiring a page.
 947  947   * The page has already been cleaned and we've been given sole access to
 948  948   * it.
 949  949   * Always returns 0 to indicate that callback succeded as the callback never
 950  950   * fails to finish retiring the given page.
 951  951   */
 952  952  /*ARGSUSED*/
 953  953  static int
 954  954  page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
 955  955  {
 956  956          int             toxic;
 957  957  
 958  958          ASSERT(PAGE_EXCL(pp));
 959  959          ASSERT(pp->p_iolock_state == 0);
 960  960          ASSERT(pp->p_szc == 0);
 961  961  
 962  962          toxic = pp->p_toxic;
 963  963  
 964  964          /*
 965  965           * The problem page is locked, demoted, unmapped, not free,
 966  966           * hashed out, and not COW or mlocked (whew!).
 967  967           *
 968  968           * Now we select our ammunition, take it around back, and shoot it.
 969  969           */
 970  970          if (toxic & PR_UE) {
 971  971  ue_error:
 972  972                  if (page_retire_transient_ue(pp)) {
 973  973                          PR_DEBUG(prd_uescrubbed);
 974  974                          (void) page_retire_done(pp, PRD_UE_SCRUBBED);
 975  975                  } else {
 976  976                          PR_DEBUG(prd_uenotscrubbed);
 977  977                          page_retire_destroy(pp);
 978  978                          (void) page_retire_done(pp, PRD_SUCCESS);
 979  979                  }
 980  980                  return (0);
 981  981          } else if (toxic & PR_FMA) {
 982  982                  PR_DEBUG(prd_fma);
 983  983                  page_retire_destroy(pp);
 984  984                  (void) page_retire_done(pp, PRD_SUCCESS);
 985  985                  return (0);
 986  986          } else if (toxic & PR_MCE) {
 987  987                  PR_DEBUG(prd_mce);
 988  988                  page_retire_destroy(pp);
 989  989                  (void) page_retire_done(pp, PRD_SUCCESS);
 990  990                  return (0);
 991  991          }
 992  992  
 993  993          /*
 994  994           * When page_retire_first_ue is set to zero and a UE occurs which is
 995  995           * transient, it's possible that we clear some flags set by a second
 996  996           * UE error on the page which occurs while the first is currently being
 997  997           * handled and thus we need to handle the case where none of the above
 998  998           * are set.  In this instance, PR_UE_SCRUBBED should be set and thus
 999  999           * we should execute the UE code above.
1000 1000           */
1001 1001          if (toxic & PR_UE_SCRUBBED) {
1002 1002                  goto ue_error;
1003 1003          }
1004 1004  
1005 1005          /*
1006 1006           * It's impossible to get here.
1007 1007           */
1008 1008          panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
1009 1009          return (0);
1010 1010  }
1011 1011  
1012 1012  /*
1013 1013   * page_retire() - the front door in to retire a page.
1014 1014   *
1015 1015   * Ideally, page_retire() would instantly retire the requested page.
1016 1016   * Unfortunately, some pages are locked or otherwise tied up and cannot be
1017 1017   * retired right away.  We use the page capture logic to deal with this
1018 1018   * situation as it will continuously try to retire the page in the background
1019 1019   * if the first attempt fails.  Success is determined by looking to see whether
1020 1020   * the page has been retired after the page_trycapture() attempt.
1021 1021   *
1022 1022   * Returns:
1023 1023   *
1024 1024   *   - 0 on success,
1025 1025   *   - EINVAL when the PA is whacko,
1026 1026   *   - EIO if the page is already retired or already pending retirement, or
1027 1027   *   - EAGAIN if the page could not be _immediately_ retired but is pending.
1028 1028   */
1029 1029  int
1030 1030  page_retire(uint64_t pa, uchar_t reason)
1031 1031  {
1032 1032          page_t  *pp;
1033 1033  
1034 1034          ASSERT(reason & PR_REASONS);            /* there must be a reason */
1035 1035          ASSERT(!(reason & ~PR_REASONS));        /* but no other bits */
1036 1036  
1037 1037          pp = page_numtopp_nolock(mmu_btop(pa));
1038 1038          if (pp == NULL) {
1039 1039                  PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
1040 1040                      " page 0x%08x.%08x; page is not relocatable memory", pa);
1041 1041                  return (page_retire_done(pp, PRD_INVALID_PA));
1042 1042          }
1043 1043          if (PP_RETIRED(pp)) {
1044 1044                  PR_DEBUG(prd_dup1);
1045 1045                  return (page_retire_done(pp, PRD_DUPLICATE));
1046 1046          }
1047 1047  
1048 1048          if (memscrub_notify_func != NULL) {
1049 1049                  (void) memscrub_notify_func(pa);
1050 1050          }
1051 1051  
1052 1052          if ((reason & PR_UE) && !PP_TOXIC(pp)) {
1053 1053                  PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
1054 1054                      " page 0x%08x.%08x", pa);
1055 1055          } else if (PP_PR_REQ(pp)) {
1056 1056                  PR_DEBUG(prd_dup2);
1057 1057                  return (page_retire_done(pp, PRD_DUPLICATE));
1058 1058          } else {
1059 1059                  PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
1060 1060                      " page 0x%08x.%08x", pa);
1061 1061          }
1062 1062  
1063 1063          /* Avoid setting toxic bits in the first place */
1064 1064          if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
1065 1065              page_retire_limit()) {
1066 1066                  return (page_retire_done(pp, PRD_LIMIT));
1067 1067          }
1068 1068  
1069 1069          if (MTBF(pr_calls, pr_mtbf)) {
1070 1070                  page_settoxic(pp, reason);
1071 1071                  if (page_trycapture(pp, 0, CAPTURE_RETIRE, pp->p_vnode) == 0) {
1072 1072                          PR_DEBUG(prd_prlocked);
1073 1073                  } else {
1074 1074                          PR_DEBUG(prd_prnotlocked);
1075 1075                  }
1076 1076          } else {
1077 1077                  PR_DEBUG(prd_prnotlocked);
1078 1078          }
1079 1079  
1080 1080          if (PP_RETIRED(pp)) {
1081 1081                  PR_DEBUG(prd_prretired);
1082 1082                  return (0);
1083 1083          } else {
1084 1084                  cv_signal(&pc_cv);
1085 1085                  PR_INCR_KSTAT(pr_failed);
1086 1086  
1087 1087                  if (pp->p_toxic & PR_MSG) {
1088 1088                          return (page_retire_done(pp, PRD_FAILED));
1089 1089                  } else {
1090 1090                          return (page_retire_done(pp, PRD_PENDING));
1091 1091                  }
1092 1092          }
1093 1093  }
1094 1094  
1095 1095  /*
1096 1096   * Take a retired page off the retired-pages vnode and clear the toxic flags.
1097 1097   * If "free" is nonzero, lock it and put it back on the freelist. If "free"
1098 1098   * is zero, the caller already holds SE_EXCL lock so we simply unretire it
1099 1099   * and don't do anything else with it.
1100 1100   *
1101 1101   * Any unretire messages are printed from this routine.
1102 1102   *
1103 1103   * Returns 0 if page pp was unretired; else an error code.
1104 1104   *
1105 1105   * If flags is:
1106 1106   *      PR_UNR_FREE - lock the page, clear the toxic flags and free it
1107 1107   *          to the freelist.
1108 1108   *      PR_UNR_TEMP - lock the page, unretire it, leave the toxic
1109 1109   *          bits set as is and return it to the caller.
1110 1110   *      PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
1111 1111   *          toxic flags and return it to caller as is.
1112 1112   */
1113 1113  int
1114 1114  page_unretire_pp(page_t *pp, int flags)
1115 1115  {
1116 1116          /*
1117 1117           * To be retired, a page has to be hashed onto the retired_pages vnode
1118 1118           * and have PR_RETIRED set in p_toxic.
1119 1119           */
1120 1120          if (flags == PR_UNR_CLEAN ||
1121 1121              page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
1122 1122                  ASSERT(PAGE_EXCL(pp));
1123 1123                  PR_DEBUG(prd_ulocked);
1124 1124                  if (!PP_RETIRED(pp)) {
1125 1125                          PR_DEBUG(prd_unotretired);
1126 1126                          page_unlock(pp);
1127 1127                          return (page_retire_done(pp, PRD_UNR_NOT));
1128 1128                  }
1129 1129  
1130 1130                  PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
1131 1131                      " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
1132 1132                  if (pp->p_toxic & PR_FMA) {
1133 1133                          PR_DECR_KSTAT(pr_fma);
1134 1134                  } else if (pp->p_toxic & PR_UE) {
1135 1135                          PR_DECR_KSTAT(pr_ue);
1136 1136                  } else {
1137 1137                          PR_DECR_KSTAT(pr_mce);
1138 1138                  }
1139 1139  
1140 1140                  if (flags == PR_UNR_TEMP)
1141 1141                          page_clrtoxic(pp, PR_RETIRED);
1142 1142                  else
1143 1143                          page_clrtoxic(pp, PR_TOXICFLAGS);
1144 1144  
1145 1145                  if (flags == PR_UNR_FREE) {
1146 1146                          PR_DEBUG(prd_udestroy);
1147 1147                          page_destroy(pp, 0);
1148 1148                  } else {
1149 1149                          PR_DEBUG(prd_uhashout);
1150 1150                          page_hashout(pp, NULL);
1151 1151                  }
1152 1152  
1153 1153                  mutex_enter(&freemem_lock);
1154 1154                  availrmem++;
1155 1155                  mutex_exit(&freemem_lock);
1156 1156  
1157 1157                  PR_DEBUG(prd_uunretired);
1158 1158                  PR_DECR_KSTAT(pr_retired);
1159 1159                  PR_INCR_KSTAT(pr_unretired);
1160 1160                  return (page_retire_done(pp, PRD_UNR_SUCCESS));
1161 1161          }
1162 1162          PR_DEBUG(prd_unotlocked);
1163 1163          return (page_retire_done(pp, PRD_UNR_CANTLOCK));
1164 1164  }
1165 1165  
1166 1166  /*
1167 1167   * Return a page to service by moving it from the retired_pages vnode
1168 1168   * onto the freelist.
1169 1169   *
1170 1170   * Called from mmioctl_page_retire() on behalf of the FMA DE.
1171 1171   *
1172 1172   * Returns:
1173 1173   *
1174 1174   *   - 0 if the page is unretired,
1175 1175   *   - EAGAIN if the pp can not be locked,
1176 1176   *   - EINVAL if the PA is whacko, and
1177 1177   *   - EIO if the pp is not retired.
1178 1178   */
1179 1179  int
1180 1180  page_unretire(uint64_t pa)
1181 1181  {
1182 1182          page_t  *pp;
1183 1183  
1184 1184          pp = page_numtopp_nolock(mmu_btop(pa));
1185 1185          if (pp == NULL) {
1186 1186                  return (page_retire_done(pp, PRD_INVALID_PA));
1187 1187          }
1188 1188  
1189 1189          return (page_unretire_pp(pp, PR_UNR_FREE));
1190 1190  }
1191 1191  
1192 1192  /*
1193 1193   * Test a page to see if it is retired. If errors is non-NULL, the toxic
1194 1194   * bits of the page are returned. Returns 0 on success, error code on failure.
1195 1195   */
1196 1196  int
1197 1197  page_retire_check_pp(page_t *pp, uint64_t *errors)
1198 1198  {
1199 1199          int rc;
1200 1200  
1201 1201          if (PP_RETIRED(pp)) {
1202 1202                  PR_DEBUG(prd_checkhit);
1203 1203                  rc = 0;
1204 1204          } else if (PP_PR_REQ(pp)) {
1205 1205                  PR_DEBUG(prd_checkmiss_pend);
1206 1206                  rc = EAGAIN;
1207 1207          } else {
1208 1208                  PR_DEBUG(prd_checkmiss_noerr);
1209 1209                  rc = EIO;
1210 1210          }
1211 1211  
1212 1212          /*
1213 1213           * We have magically arranged the bit values returned to fmd(1M)
1214 1214           * to line up with the FMA, MCE, and UE bits of the page_t.
1215 1215           */
1216 1216          if (errors) {
1217 1217                  uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
1218 1218                  if (toxic & PR_UE_SCRUBBED) {
1219 1219                          toxic &= ~PR_UE_SCRUBBED;
1220 1220                          toxic |= PR_UE;
1221 1221                  }
1222 1222                  *errors = toxic;
1223 1223          }
1224 1224  
1225 1225          return (rc);
1226 1226  }
1227 1227  
1228 1228  /*
1229 1229   * Test to see if the page_t for a given PA is retired, and return the
1230 1230   * hardware errors we have seen on the page if requested.
1231 1231   *
1232 1232   * Called from mmioctl_page_retire on behalf of the FMA DE.
1233 1233   *
1234 1234   * Returns:
1235 1235   *
1236 1236   *   - 0 if the page is retired,
1237 1237   *   - EIO if the page is not retired and has no errors,
1238 1238   *   - EAGAIN if the page is not retired but is pending; and
1239 1239   *   - EINVAL if the PA is whacko.
1240 1240   */
1241 1241  int
1242 1242  page_retire_check(uint64_t pa, uint64_t *errors)
1243 1243  {
1244 1244          page_t  *pp;
1245 1245  
1246 1246          if (errors) {
1247 1247                  *errors = 0;
1248 1248          }
1249 1249  
1250 1250          pp = page_numtopp_nolock(mmu_btop(pa));
1251 1251          if (pp == NULL) {
1252 1252                  return (page_retire_done(pp, PRD_INVALID_PA));
1253 1253          }
1254 1254  
1255 1255          return (page_retire_check_pp(pp, errors));
1256 1256  }
1257 1257  
1258 1258  /*
1259 1259   * Page retire self-test. For now, it always returns 0.
1260 1260   */
1261 1261  int
1262 1262  page_retire_test(void)
1263 1263  {
1264 1264          page_t *first, *pp, *cpp, *cpp2, *lpp;
1265 1265  
1266 1266          /*
1267 1267           * Tests the corner case where a large page can't be retired
1268 1268           * because one of the constituent pages is locked. We mark
1269 1269           * one page to be retired and try to retire it, and mark the
1270 1270           * other page to be retired but don't try to retire it, so
1271 1271           * that page_unlock() in the failure path will recurse and try
1272 1272           * to retire THAT page. This is the worst possible situation
1273 1273           * we can get ourselves into.
1274 1274           */
1275 1275          memsegs_lock(0);
1276 1276          pp = first = page_first();
1277 1277          do {
1278 1278                  if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
1279 1279                          cpp = pp + 1;
1280 1280                          lpp = PP_ISFREE(pp)? pp : pp + 2;
1281 1281                          cpp2 = pp + 3;
1282 1282                          if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
1283 1283                                  continue;
1284 1284                          if (!page_trylock(cpp, SE_EXCL)) {
1285 1285                                  page_unlock(lpp);
1286 1286                                  continue;
1287 1287                          }
1288 1288  
1289 1289                          /* fails */
1290 1290                          (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
1291 1291  
1292 1292                          page_unlock(lpp);
1293 1293                          page_unlock(cpp);
1294 1294                          (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
1295 1295                          (void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
1296 1296                  }
1297 1297          } while ((pp = page_next(pp)) != first);
1298 1298          memsegs_unlock(0);
1299 1299  
1300 1300          return (0);
1301 1301  }

↓ open down ↓

409 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX