/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Page Retire - Big Theory Statement.
 *
 * This file handles removing sections of faulty memory from use when the
 * user land FMA Diagnosis Engine requests that a page be removed or when
 * a CE or UE is detected by the hardware.
 *
 * In the bad old days, the kernel side of Page Retire did a lot of the work
 * on its own. Now, with the DE keeping track of errors, the kernel side is
 * rather simple-minded on most platforms.
 *
 * Errors are all reflected to the DE, and after digesting the error and
 * looking at all previously reported errors, the DE decides what should
 * be done about the current error. If the DE wants a particular page to
 * be retired, then the kernel page retire code is invoked via an ioctl.
 * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
 * page retire to handle the error. Since page retire is just a simple
 * mechanism it doesn't need to differentiate between the different callers.
 *
 * The p_toxic field in the page_t is used to indicate which errors have
 * occurred and what action has been taken on a given page. Because errors are
 * reported without regard to the locked state of a page, no locks are used
 * to SET the error bits in p_toxic. However, in order to clear the error
 * bits, the page_t must be held exclusively locked.
 *
 * When page_retire() is called, it must be able to acquire locks, sleep, etc.
 * It must not be called from high-level interrupt context.
 *
 * Depending on how the requested page is being used at the time of the retire
 * request (and on the availability of sufficient system resources), the page
 * may be retired immediately, or just marked for retirement later. For
 * example, locked pages are marked, while free pages are retired. Multiple
 * requests may be made to retire the same page, although there is no need
 * to: once the p_toxic flags are set, the page will be retired as soon as it
 * can be exclusively locked.
 *
 * The retire mechanism is driven centrally out of page_unlock(). To expedite
 * the retirement of pages, further requests for SE_SHARED locks are denied
 * as long as a page retirement is pending. In addition, as long as pages are
 * pending retirement a background thread runs periodically trying to retire
 * those pages. Pages which could not be retired while the system is running
 * are scrubbed prior to rebooting to avoid latent errors on the next boot.
 *
 * UE pages without persistent errors are scrubbed and returned to service.
 * Recidivist pages, as well as FMA-directed requests for retirement, result
 * in the page being taken out of service. Once the decision is made to take
 * a page out of service, the page is cleared, hashed onto the retired_pages
 * vnode, marked as retired, and it is unlocked.  No other requesters (except
 * for unretire) are allowed to lock retired pages.
 *
 * The public routines return (sadly) 0 if they worked and a non-zero error
 * value if something went wrong. This is done for the ioctl side of the
 * world to allow errors to be reflected all the way out to user land. The
 * non-zero values are explained in comments atop each function.
 */
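
/*
 * As a sketch of the contract described above (a hypothetical caller, not
 * code from this file), a consumer that has just diagnosed an error on a
 * physical address might invoke the front door below and interpret the
 * error values, which mirror what the ioctl path reflects to user land:
 *
 *      uint64_t pa = ...;              physical address of the bad page
 *      int err = page_retire(pa, PR_FMA);
 *      switch (err) {
 *      case 0:         retired immediately
 *      case EAGAIN:    marked; will retire once it can be locked
 *      case EIO:       already retired or already pending retirement
 *      case EINVAL:    pa is not relocatable memory
 *      }
 */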

/*
 * Things to fix:
 *
 *      1. Trying to retire non-relocatable kvp pages may result in a
 *      quagmire. This is because seg_kmem() no longer keeps its pages locked,
 *      and calls page_lookup() in the free path; since kvp pages are modified
 *      and don't have a usable backing store, page_retire() can't do anything
 *      with them, and we'll keep denying the lock to seg_kmem_free() in a
 *      vicious cycle. To prevent that, we don't deny locks to kvp pages, and
 *      hence only try to retire a page from page_unlock() in the free path.
 *      Since most kernel pages are indefinitely held anyway, and don't
 *      participate in I/O, this is of little consequence.
 *
 *      2. Low memory situations will be interesting. If we don't have
 *      enough memory for page_relocate() to succeed, we won't be able to
 *      retire dirty pages; nobody will be able to push them out to disk
 *      either, since we aggressively deny the page lock. We could change
 *      fsflush so it can recognize this situation, grab the lock, and push
 *      the page out, where we'll catch it in the free path and retire it.
 *
 *      3. Beware of places that have code like this in them:
 *
 *              if (! page_tryupgrade(pp)) {
 *                      page_unlock(pp);
 *                      while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 *                              / *NOTHING* /
 *                      }
 *              }
 *              page_free(pp);
 *
 *      The problem is that pp can change identity right after the
 *      page_unlock() call.  In particular, page_retire() can step in
 *      there, change pp's identity, and hash pp onto the retired_vnode.
 *
 *      Of course, other functions besides page_retire() can have the
 *      same effect. A kmem reader can waltz by, set up a mapping to the
 *      page, and then unlock the page. Page_free() will then go castors
 *      up. So if anybody is doing this, it's already a bug. (A safer
 *      re-lookup construction is sketched after this comment.)
 *
 *      4. mdboot()'s call into page_retire_mdboot() should probably be
 *      moved lower. Where the call is made now, we can get into trouble
 *      by scrubbing a kernel page that is then accessed later.
 */
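
/*
 * For item 3 above, a minimal sketch of a safer construction (again
 * illustrative, not code from this file): remember the page's identity
 * before dropping the lock, then revalidate with page_lookup(), which
 * reacquires the lock and fails if the <vnode, offset> identity has
 * changed underneath us:
 *
 *              vnode_t *vp = pp->p_vnode;
 *              u_offset_t off = pp->p_offset;
 *              if (! page_tryupgrade(pp)) {
 *                      page_unlock(pp);
 *                      if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
 *                              return;         / * identity changed * /
 *              }
 *              page_free(pp);
 */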

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
#include <sys/cmn_err.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/ontrap.h>
#include <sys/vmsystm.h>
#include <sys/mem_config.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/kobj.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/seg_kmem.h>

/*
 * vnode for all pages which are retired from the VM system.
 */
vnode_t *retired_pages;

static int page_retire_pp_finish(page_t *, void *, uint_t);

/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired.  At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs.  Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 */
#ifdef  DEBUG
#define PR_PENDING_QMAX 32
#else   /* DEBUG */
#define PR_PENDING_QMAX 256
#endif  /* DEBUG */
page_t          *pr_pending_q[PR_PENDING_QMAX];
kmutex_t        pr_q_mutex;

/*
 * Page retire global kstats
 */
struct page_retire_kstat {
        kstat_named_t   pr_retired;
        kstat_named_t   pr_requested;
        kstat_named_t   pr_requested_free;
        kstat_named_t   pr_enqueue_fail;
        kstat_named_t   pr_dequeue_fail;
        kstat_named_t   pr_pending;
        kstat_named_t   pr_pending_kas;
        kstat_named_t   pr_failed;
        kstat_named_t   pr_failed_kernel;
        kstat_named_t   pr_limit;
        kstat_named_t   pr_limit_exceeded;
        kstat_named_t   pr_fma;
        kstat_named_t   pr_mce;
        kstat_named_t   pr_ue;
        kstat_named_t   pr_ue_cleared_retire;
        kstat_named_t   pr_ue_cleared_free;
        kstat_named_t   pr_ue_persistent;
        kstat_named_t   pr_unretired;
};

static struct page_retire_kstat page_retire_kstat = {
        { "pages_retired",              KSTAT_DATA_UINT64},
        { "pages_retire_request",       KSTAT_DATA_UINT64},
        { "pages_retire_request_free",  KSTAT_DATA_UINT64},
        { "pages_notenqueued",          KSTAT_DATA_UINT64},
        { "pages_notdequeued",          KSTAT_DATA_UINT64},
        { "pages_pending",              KSTAT_DATA_UINT64},
        { "pages_pending_kas",          KSTAT_DATA_UINT64},
        { "pages_deferred",             KSTAT_DATA_UINT64},
        { "pages_deferred_kernel",      KSTAT_DATA_UINT64},
        { "pages_limit",                KSTAT_DATA_UINT64},
        { "pages_limit_exceeded",       KSTAT_DATA_UINT64},
        { "pages_fma",                  KSTAT_DATA_UINT64},
        { "pages_multiple_ce",          KSTAT_DATA_UINT64},
        { "pages_ue",                   KSTAT_DATA_UINT64},
        { "pages_ue_cleared_retired",   KSTAT_DATA_UINT64},
        { "pages_ue_cleared_freed",     KSTAT_DATA_UINT64},
        { "pages_ue_persistent",        KSTAT_DATA_UINT64},
        { "pages_unretired",            KSTAT_DATA_UINT64},
};

static kstat_t  *page_retire_ksp = NULL;

#define PR_INCR_KSTAT(stat)     \
        atomic_inc_64(&(page_retire_kstat.stat.value.ui64))
#define PR_DECR_KSTAT(stat)     \
        atomic_dec_64(&(page_retire_kstat.stat.value.ui64))

#define PR_KSTAT_RETIRED_CE     (page_retire_kstat.pr_mce.value.ui64)
#define PR_KSTAT_RETIRED_FMA    (page_retire_kstat.pr_fma.value.ui64)
#define PR_KSTAT_RETIRED_NOTUE  (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define PR_KSTAT_PENDING        (page_retire_kstat.pr_pending.value.ui64)
#define PR_KSTAT_PENDING_KAS    (page_retire_kstat.pr_pending_kas.value.ui64)
#define PR_KSTAT_EQFAIL         (page_retire_kstat.pr_enqueue_fail.value.ui64)
#define PR_KSTAT_DQFAIL         (page_retire_kstat.pr_dequeue_fail.value.ui64)

/*
 * page retire kstats to list all retired pages
 */
static int pr_list_kstat_update(kstat_t *ksp, int rw);
static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
kmutex_t pr_list_kstat_mutex;

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 */
#define MCE_BPT 10
uint64_t        max_pages_retired_bps = MCE_BPT;
#define PAGE_RETIRE_LIMIT       ((physmem * max_pages_retired_bps) / 10000)
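
/*
 * For example (numbers purely illustrative): with 4 KB pages and 4 GB of
 * physical memory, physmem is 1048576 pages, so the default limit above
 * works out to (1048576 * 10) / 10000 = 1048 pages -- about 4 MB of
 * memory retired for multiple CEs before further requests are refused.
 */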

/*
 * Control over the verbosity of page retirement.
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, summary messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one is intended as a platform tunable for processors where
 * FMA's DE does not run (e.g., spitfire). Values > one are intended for
 * debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 */
static int pr_enable = 0;

static void (*memscrub_notify_func)(uint64_t);

#ifdef  DEBUG
struct page_retire_debug {
        int prd_dup1;
        int prd_dup2;
        int prd_qdup;
        int prd_noaction;
        int prd_queued;
        int prd_notqueued;
        int prd_dequeue;
        int prd_top;
        int prd_locked;
        int prd_reloc;
        int prd_relocfail;
        int prd_mod;
        int prd_mod_late;
        int prd_kern;
        int prd_free;
        int prd_noreclaim;
        int prd_hashout;
        int prd_fma;
        int prd_uescrubbed;
        int prd_uenotscrubbed;
        int prd_mce;
        int prd_prlocked;
        int prd_prnotlocked;
        int prd_prretired;
        int prd_ulocked;
        int prd_unotretired;
        int prd_udestroy;
        int prd_uhashout;
        int prd_uunretired;
        int prd_unotlocked;
        int prd_checkhit;
        int prd_checkmiss_pend;
        int prd_checkmiss_noerr;
        int prd_tctop;
        int prd_tclocked;
        int prd_hunt;
        int prd_dohunt;
        int prd_earlyhunt;
        int prd_latehunt;
        int prd_nofreedemote;
        int prd_nodemote;
        int prd_demoted;
} pr_debug;

#define PR_DEBUG(foo)   ((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_types offset bit encoding (when examining with a debugger):
 *
 *    PRT_NAMED  - 0x01
 *    PRT_KERNEL - 0x02
 *    PRT_FREE   - 0x04
 *    PRT_MOD    - 0x08
 *    PRT_FMA    - 0x00
 *    PRT_MCE    - 0x10
 *    PRT_UE     - 0x20
 */

#define PRT_NAMED       0x01
#define PRT_KERNEL      0x02
#define PRT_FREE        0x04
#define PRT_MOD         0x08
#define PRT_FMA         0x00    /* yes, this is not a mistake */
#define PRT_MCE         0x10
#define PRT_UE          0x20
#define PRT_ALL         0x3F

int pr_types[PRT_ALL+1];

#define PR_TYPES(pp)    {                       \
        int whichtype = 0;                      \
        if (pp->p_vnode)                        \
                whichtype |= PRT_NAMED;         \
        if (PP_ISKAS(pp))                       \
                whichtype |= PRT_KERNEL;        \
        if (PP_ISFREE(pp))                      \
                whichtype |= PRT_FREE;          \
        if (hat_ismod(pp))                      \
                whichtype |= PRT_MOD;           \
        if (pp->p_toxic & PR_UE)                \
                whichtype |= PRT_UE;            \
        if (pp->p_toxic & PR_MCE)               \
                whichtype |= PRT_MCE;           \
        pr_types[whichtype]++;                  \
}

int recl_calls;
int recl_mtbf = 3;
int reloc_calls;
int reloc_mtbf = 7;
int pr_calls;
int pr_mtbf = 15;

#define MTBF(v, f)      (((++(v)) & (f)) != (f))
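
/*
 * MTBF() is the DEBUG fault-injection hook: it increments the counter v
 * and evaluates false only when the low bits of v all match the mask f.
 * With f of the form 2^n - 1, that simulates one failure in every 2^n
 * calls; e.g., with pr_mtbf = 15 above, one page_retire() attempt in
 * every 16 takes the artificial-failure path.
 */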

#else   /* DEBUG */

#define PR_DEBUG(foo)   /* nothing */
#define PR_TYPES(foo)   /* nothing */
#define MTBF(v, f)      (1)

#endif  /* DEBUG */

/*
 * page_retire_done() - completion processing
 *
 * Used by the page_retire code for common completion processing.
 * It keeps track of how many times a given result has happened,
 * and writes out an occasional message.
 *
 * May be called with a NULL pp (PRD_INVALID_PA case).
 */
#define PRD_INVALID_KEY         -1
#define PRD_SUCCESS             0
#define PRD_PENDING             1
#define PRD_FAILED              2
#define PRD_DUPLICATE           3
#define PRD_INVALID_PA          4
#define PRD_LIMIT               5
#define PRD_UE_SCRUBBED         6
#define PRD_UNR_SUCCESS         7
#define PRD_UNR_CANTLOCK        8
#define PRD_UNR_NOT             9

typedef struct page_retire_op {
        int     pr_key;         /* one of the PRD_* defines from above */
        int     pr_count;       /* How many times this has happened */
        int     pr_retval;      /* return value */
        int     pr_msglvl;      /* message level - when to print */
        char    *pr_message;    /* Cryptic message for field service */
} page_retire_op_t;

static page_retire_op_t page_retire_ops[] = {
        /* key                  count   retval  msglvl  message */
        {PRD_SUCCESS,           0,      0,      1,
                "Page 0x%08x.%08x removed from service"},
        {PRD_PENDING,           0,      EAGAIN, 2,
                "Page 0x%08x.%08x will be retired on free"},
        {PRD_FAILED,            0,      EAGAIN, 0, NULL},
        {PRD_DUPLICATE,         0,      EIO,    2,
                "Page 0x%08x.%08x already retired or pending"},
        {PRD_INVALID_PA,        0,      EINVAL, 2,
                "PA 0x%08x.%08x is not a relocatable page"},
        {PRD_LIMIT,             0,      0,      1,
                "Page 0x%08x.%08x not retired due to limit exceeded"},
        {PRD_UE_SCRUBBED,       0,      0,      1,
                "Previously reported error on page 0x%08x.%08x cleared"},
        {PRD_UNR_SUCCESS,       0,      0,      1,
                "Page 0x%08x.%08x returned to service"},
        {PRD_UNR_CANTLOCK,      0,      EAGAIN, 2,
                "Page 0x%08x.%08x could not be unretired"},
        {PRD_UNR_NOT,           0,      EIO,    2,
                "Page 0x%08x.%08x is not retired"},
        {PRD_INVALID_KEY,       0,      0,      0, NULL} /* MUST BE LAST! */
};

/*
 * Print a message if page_retire_messages is at least msglvl.
 */
#define PR_MESSAGE(debuglvl, msglvl, msg, pa)                           \
{                                                                       \
        uint64_t p = (uint64_t)pa;                                      \
        if (page_retire_messages >= msglvl && msg != NULL) {            \
                cmn_err(debuglvl, msg,                                  \
                    (uint32_t)(p >> 32), (uint32_t)p);                  \
        }                                                               \
}

/*
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
        atomic_or_8(&pp->p_toxic, bits);
}

/*
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked to prevent races which
 * may attempt to retire a page without any toxic bits set.
 * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
 * being held as there is a separate mutex which protects that bit.
 */
void
page_clrtoxic(page_t *pp, uchar_t bits)
{
        ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
        atomic_and_8(&pp->p_toxic, ~bits);
}

/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 */
static int
page_retire_done(page_t *pp, int code)
{
        page_retire_op_t *prop;
        uint64_t        pa = 0;
        int             i;

        if (pp != NULL) {
                pa = mmu_ptob((uint64_t)pp->p_pagenum);
        }

        prop = NULL;
        for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
                if (page_retire_ops[i].pr_key == code) {
                        prop = &page_retire_ops[i];
                        break;
                }
        }

#ifdef  DEBUG
        if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
                cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
        }
#endif

        ASSERT(prop->pr_key == code);

        prop->pr_count++;

        PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
        if (pp != NULL) {
                page_settoxic(pp, PR_MSG);
        }

        return (prop->pr_retval);
}

/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
        u_offset_t off = (u_offset_t)((uintptr_t)pp);

        ASSERT(PAGE_EXCL(pp));
        ASSERT(!PP_ISFREE(pp));
        ASSERT(pp->p_szc == 0);
        ASSERT(!hat_page_is_mapped(pp));
        ASSERT(!pp->p_vnode);

        page_clr_all_props(pp);
        pagescrub(pp, 0, MMU_PAGESIZE);

        pp->p_next = NULL;
        pp->p_prev = NULL;
        if (page_hashin(pp, retired_pages, off, NULL) == 0) {
                cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
        }

        page_settoxic(pp, PR_RETIRED);
        PR_INCR_KSTAT(pr_retired);

        if (pp->p_toxic & PR_FMA) {
                PR_INCR_KSTAT(pr_fma);
        } else if (pp->p_toxic & PR_UE) {
                PR_INCR_KSTAT(pr_ue);
        } else {
                PR_INCR_KSTAT(pr_mce);
        }

        mutex_enter(&freemem_lock);
        availrmem--;
        mutex_exit(&freemem_lock);

        page_unlock(pp);
}

/*
 * Check whether the number of pages which have been retired already exceeds
 * the maximum allowable percentage of memory which may be retired.
 *
 * Returns 1 if the limit has been exceeded.
 */
static int
page_retire_limit(void)
{
        if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
                PR_INCR_KSTAT(pr_limit_exceeded);
                return (1);
        }

        return (0);
}

#define MSG_DM  "Data Mismatch occurred at PA 0x%08x.%08x"              \
        "[ 0x%x != 0x%x ] while attempting to clear previously "        \
        "reported error; page removed from service"

#define MSG_UE  "Uncorrectable Error occurred at PA 0x%08x.%08x while " \
        "attempting to clear previously reported error; page removed "  \
        "from service"

/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
        caddr_t         kaddr;
        uint8_t         rb, wb;
        uint64_t        pa;
        uint32_t        pa_hi, pa_lo;
        on_trap_data_t  otd;
        int             errors = 0;
        int             i;

        ASSERT(PAGE_EXCL(pp));
        ASSERT(PP_PR_REQ(pp));
        ASSERT(pp->p_szc == 0);
        ASSERT(!hat_page_is_mapped(pp));

        /*
         * Clear the page and attempt to clear the UE.  If we trap
         * on the next access to the page, we know the UE has recurred.
         */
        pagescrub(pp, 0, PAGESIZE);

        /*
         * Map the page and write a bunch of bit patterns to compare
         * what we wrote with what we read back.  This isn't a perfect
         * test but it should be good enough to catch most of the
         * recurring UEs. If this fails to catch a recurrent UE, we'll
         * retire the page the next time we see a UE on the page.
         */
        kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

        pa = ptob((uint64_t)page_pptonum(pp));
        pa_hi = (uint32_t)(pa >> 32);
        pa_lo = (uint32_t)pa;

        /*
         * Disable preemption to prevent the off chance that
         * we migrate while in the middle of running through
         * the bit pattern and run on a different processor
         * than what we started on.
         */
        kpreempt_disable();

        /*
         * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
         * the cache in between reading and writing.  We do this under
         * on_trap() protection to avoid recursion.
         */
        if (on_trap(&otd, OT_DATA_EC)) {
                PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
                errors = 1;
        } else {
                for (wb = 0xff; wb > 0; wb--) {
                        for (i = 0; i < PAGESIZE; i++) {
                                kaddr[i] = wb;
                        }

                        sync_data_memory(kaddr, PAGESIZE);

                        for (i = 0; i < PAGESIZE; i++) {
                                rb = kaddr[i];
                                if (rb != wb) {
                                        /*
                                         * We had a mismatch without a trap.
                                         * Uh-oh. Something is really wrong
                                         * with this system.
                                         */
                                        if (page_retire_messages) {
                                                cmn_err(CE_WARN, MSG_DM,
                                                    pa_hi, pa_lo, rb, wb);
                                        }
                                        errors = 1;
                                        goto out;       /* double break */
                                }
                        }
                }
        }
out:
        no_trap();
        kpreempt_enable();
        ppmapout(kaddr);

        return (errors ? 0 : 1);
}

/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
        ASSERT(PAGE_EXCL(pp));
        ASSERT(!hat_page_is_mapped(pp));

        /*
         * If this page is a repeat offender, retire him under the
         * "two strikes and you're out" rule. The caller is responsible
         * for scrubbing the page to try to clear the error.
         */
        if (pp->p_toxic & PR_UE_SCRUBBED) {
                PR_INCR_KSTAT(pr_ue_persistent);
                return (0);
        }

        if (page_clear_transient_ue(pp)) {
                /*
                 * We set the PR_UE_SCRUBBED bit; if we ever see this
                 * page again, we will retire it, no questions asked.
                 */
                page_settoxic(pp, PR_UE_SCRUBBED);

                if (page_retire_first_ue) {
                        PR_INCR_KSTAT(pr_ue_cleared_retire);
                        return (0);
                } else {
                        PR_INCR_KSTAT(pr_ue_cleared_free);

                        page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);

                        /* LINTED: CONSTCOND */
                        VN_DISPOSE(pp, B_FREE, 1, kcred);
                        return (1);
                }
        }

        PR_INCR_KSTAT(pr_ue_persistent);
        return (0);
}

/*
 * Update the statistics dynamically when our kstat is read.
 */
static int
page_retire_kstat_update(kstat_t *ksp, int rw)
{
        struct page_retire_kstat *pr;

        if (ksp == NULL)
                return (EINVAL);

        switch (rw) {

        case KSTAT_READ:
                pr = (struct page_retire_kstat *)ksp->ks_data;
                ASSERT(pr == &page_retire_kstat);
                pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
                return (0);

        case KSTAT_WRITE:
                return (EACCES);

        default:
                return (EINVAL);
        }
        /*NOTREACHED*/
}

static int
pr_list_kstat_update(kstat_t *ksp, int rw)
{
        uint_t count;
        page_t *pp;
        kmutex_t *vphm;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        vphm = page_vnode_mutex(retired_pages);
        mutex_enter(vphm);
        /* Needs to be under a lock so that for loop will work right */
        if (retired_pages->v_pages == NULL) {
                mutex_exit(vphm);
                ksp->ks_ndata = 0;
                ksp->ks_data_size = 0;
                return (0);
        }

        count = 1;
        for (pp = retired_pages->v_pages->p_vpnext;
            pp != retired_pages->v_pages; pp = pp->p_vpnext) {
                count++;
        }
        mutex_exit(vphm);

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

/*
 * All spans will be pagesize and no coalescing will be done with the
 * list produced.
 */
static int
pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        kmutex_t *vphm;
        page_t *pp;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;

        vphm = page_vnode_mutex(retired_pages);
        mutex_enter(vphm);
        pp = retired_pages->v_pages;
        if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
            (pp == NULL)) {
                mutex_exit(vphm);
                return (0);
        }
        kspmem->address = ptob(pp->p_pagenum);
        kspmem->size = PAGESIZE;
        kspmem++;
        for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
            pp = pp->p_vpnext, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = ptob(pp->p_pagenum);
                kspmem->size = PAGESIZE;
        }
        mutex_exit(vphm);

        return (0);
}

/*
 * page_retire_pend_count -- helper function for page_capture_thread,
 * returns the number of pages pending retirement.
 */
uint64_t
page_retire_pend_count(void)
{
        return (PR_KSTAT_PENDING);
}

uint64_t
page_retire_pend_kas_count(void)
{
        return (PR_KSTAT_PENDING_KAS);
}

void
page_retire_incr_pend_count(void *datap)
{
        PR_INCR_KSTAT(pr_pending);

        if ((datap == &kvp) || (datap == &zvp)) {
                PR_INCR_KSTAT(pr_pending_kas);
        }
}

void
page_retire_decr_pend_count(void *datap)
{
        PR_DECR_KSTAT(pr_pending);

        if ((datap == &kvp) || (datap == &zvp)) {
                PR_DECR_KSTAT(pr_pending_kas);
        }
}

/*
 * Initialize the page retire mechanism:
 *
 *   - Establish the correctable error retire limit.
 *   - Initialize locks.
 *   - Build the retired_pages vnode.
 *   - Set up the kstats.
 *   - Fire off the background thread.
 *   - Tell page_retire() it's OK to start retiring pages.
 */
void
page_retire_init(void)
{
        const fs_operation_def_t retired_vnodeops_template[] = {
                { NULL, NULL }
        };
        struct vnodeops *vops;
        kstat_t *ksp;

        const uint_t page_retire_ndata =
            sizeof (page_retire_kstat) / sizeof (kstat_named_t);

        ASSERT(page_retire_ksp == NULL);

        if (max_pages_retired_bps <= 0) {
                max_pages_retired_bps = MCE_BPT;
        }

        mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);

        retired_pages = vn_alloc(KM_SLEEP);
        if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
                cmn_err(CE_PANIC,
                    "page_retire_init: can't make retired vnodeops");
        }
        vn_setops(retired_pages, vops);

        if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
            "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
            KSTAT_FLAG_VIRTUAL)) == NULL) {
                cmn_err(CE_WARN, "kstat_create for page_retire failed");
        } else {
                page_retire_ksp->ks_data = (void *)&page_retire_kstat;
                page_retire_ksp->ks_update = page_retire_kstat_update;
                kstat_install(page_retire_ksp);
        }

        mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
        ksp = kstat_create("unix", 0, "page_retire_list", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = pr_list_kstat_update;
                ksp->ks_snapshot = pr_list_kstat_snapshot;
                ksp->ks_lock = &pr_list_kstat_mutex;
                kstat_install(ksp);
        }

        memscrub_notify_func =
            (void(*)(uint64_t))kobj_getsymvalue("memscrub_notify", 0);

        page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
        pr_enable = 1;
}
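
/*
 * Once initialization is complete, the statistics and the retired-page
 * list registered above are observable from user land with kstat(1M),
 * e.g. (illustrative invocations):
 *
 *      # kstat -m unix -n page_retire
 *      # kstat -m unix -n page_retire_list
 */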

/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
        PR_DEBUG(prd_tctop);
        if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
                PR_DEBUG(prd_tclocked);
                page_unlock(pp);
        }
}

/*
 * Callback used by page_trycapture() to finish off retiring a page.
 * The page has already been cleaned and we've been given sole access to
 * it.
 * Always returns 0 to indicate that the callback succeeded, as the
 * callback never fails to finish retiring the given page.
 */
/*ARGSUSED*/
static int
page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
{
        int             toxic;

        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_iolock_state == 0);
        ASSERT(pp->p_szc == 0);

        toxic = pp->p_toxic;

        /*
         * The problem page is locked, demoted, unmapped, not free,
         * hashed out, and not COW or mlocked (whew!).
         *
         * Now we select our ammunition, take it around back, and shoot it.
         */
        if (toxic & PR_UE) {
ue_error:
                if (page_retire_transient_ue(pp)) {
                        PR_DEBUG(prd_uescrubbed);
                        (void) page_retire_done(pp, PRD_UE_SCRUBBED);
                } else {
                        PR_DEBUG(prd_uenotscrubbed);
                        page_retire_destroy(pp);
                        (void) page_retire_done(pp, PRD_SUCCESS);
                }
                return (0);
        } else if (toxic & PR_FMA) {
                PR_DEBUG(prd_fma);
                page_retire_destroy(pp);
                (void) page_retire_done(pp, PRD_SUCCESS);
                return (0);
        } else if (toxic & PR_MCE) {
                PR_DEBUG(prd_mce);
                page_retire_destroy(pp);
                (void) page_retire_done(pp, PRD_SUCCESS);
                return (0);
        }

        /*
         * When page_retire_first_ue is set to zero and a transient UE
         * occurs, it's possible that we clear some flags set by a second
         * UE that arrived while the first was being handled, so we must
         * deal with the case where none of the above bits are set.  In
         * that instance, PR_UE_SCRUBBED should be set, so we execute the
         * UE code above.
         */
        if (toxic & PR_UE_SCRUBBED) {
                goto ue_error;
        }

        /*
         * It's impossible to get here.
         */
        panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
        return (0);
}

/*
 * page_retire() - the front door in to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away.  We use the page capture logic to deal with this
 * situation as it will continuously try to retire the page in the background
 * if the first attempt fails.  Success is determined by looking to see whether
 * the page has been retired after the page_trycapture() attempt.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is whacko,
 *   - EIO if the page is already retired or already pending retirement, or
 *   - EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
        page_t  *pp;

        ASSERT(reason & PR_REASONS);            /* there must be a reason */
        ASSERT(!(reason & ~PR_REASONS));        /* but no other bits */

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
                    " page 0x%08x.%08x; page is not relocatable memory", pa);
                return (page_retire_done(pp, PRD_INVALID_PA));
        }
        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_dup1);
                return (page_retire_done(pp, PRD_DUPLICATE));
        }

        if (memscrub_notify_func != NULL) {
                (void) memscrub_notify_func(pa);
        }

        if ((reason & PR_UE) && !PP_TOXIC(pp)) {
                PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
                    " page 0x%08x.%08x", pa);
        } else if (PP_PR_REQ(pp)) {
                PR_DEBUG(prd_dup2);
                return (page_retire_done(pp, PRD_DUPLICATE));
        } else {
                PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
                    " page 0x%08x.%08x", pa);
        }

        /* Avoid setting toxic bits in the first place */
        if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
            page_retire_limit()) {
                return (page_retire_done(pp, PRD_LIMIT));
        }

        if (MTBF(pr_calls, pr_mtbf)) {
                page_settoxic(pp, reason);
                if (page_trycapture(pp, 0, CAPTURE_RETIRE, pp->p_vnode) == 0) {
                        PR_DEBUG(prd_prlocked);
                } else {
                        PR_DEBUG(prd_prnotlocked);
                }
        } else {
                PR_DEBUG(prd_prnotlocked);
        }

        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_prretired);
                return (0);
        } else {
                cv_signal(&pc_cv);
                PR_INCR_KSTAT(pr_failed);

                if (pp->p_toxic & PR_MSG) {
                        return (page_retire_done(pp, PRD_FAILED));
                } else {
                        return (page_retire_done(pp, PRD_PENDING));
                }
        }
}

/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * Depending on "flags", the page is either freed back to the freelist or
 * simply unretired and handed back to the caller.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 *
 * If flags is:
 *      PR_UNR_FREE - lock the page, clear the toxic flags and free it
 *          to the freelist.
 *      PR_UNR_TEMP - lock the page, unretire it, leave the toxic
 *          bits set as is and return it to the caller.
 *      PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
 *          toxic flags and return it to caller as is.
 */
int
page_unretire_pp(page_t *pp, int flags)
{
        /*
         * To be retired, a page has to be hashed onto the retired_pages vnode
         * and have PR_RETIRED set in p_toxic.
         */
        if (flags == PR_UNR_CLEAN ||
            page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
                ASSERT(PAGE_EXCL(pp));
                PR_DEBUG(prd_ulocked);
                if (!PP_RETIRED(pp)) {
                        PR_DEBUG(prd_unotretired);
                        page_unlock(pp);
                        return (page_retire_done(pp, PRD_UNR_NOT));
                }

                PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
                    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
                if (pp->p_toxic & PR_FMA) {
                        PR_DECR_KSTAT(pr_fma);
                } else if (pp->p_toxic & PR_UE) {
                        PR_DECR_KSTAT(pr_ue);
                } else {
                        PR_DECR_KSTAT(pr_mce);
                }

                if (flags == PR_UNR_TEMP)
                        page_clrtoxic(pp, PR_RETIRED);
                else
                        page_clrtoxic(pp, PR_TOXICFLAGS);

                if (flags == PR_UNR_FREE) {
                        PR_DEBUG(prd_udestroy);
                        page_destroy(pp, 0);
                } else {
                        PR_DEBUG(prd_uhashout);
                        page_hashout(pp, NULL);
                }

                mutex_enter(&freemem_lock);
                availrmem++;
                mutex_exit(&freemem_lock);

                PR_DEBUG(prd_uunretired);
                PR_DECR_KSTAT(pr_retired);
                PR_INCR_KSTAT(pr_unretired);
                return (page_retire_done(pp, PRD_UNR_SUCCESS));
        }
        PR_DEBUG(prd_unotlocked);
        return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}

/*
 * Return a page to service by moving it from the retired_pages vnode
 * onto the freelist.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is unretired,
 *   - EAGAIN if the pp can not be locked,
 *   - EINVAL if the PA is whacko, and
 *   - EIO if the pp is not retired.
 */
int
page_unretire(uint64_t pa)
{
        page_t  *pp;

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                return (page_retire_done(pp, PRD_INVALID_PA));
        }

        return (page_unretire_pp(pp, PR_UNR_FREE));
}

/*
 * Test a page to see if it is retired. If errors is non-NULL, the toxic
 * bits of the page are returned. Returns 0 on success, error code on failure.
 */
int
page_retire_check_pp(page_t *pp, uint64_t *errors)
{
        int rc;

        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_checkhit);
                rc = 0;
        } else if (PP_PR_REQ(pp)) {
                PR_DEBUG(prd_checkmiss_pend);
                rc = EAGAIN;
        } else {
                PR_DEBUG(prd_checkmiss_noerr);
                rc = EIO;
        }

        /*
         * We have magically arranged the bit values returned to fmd(1M)
         * to line up with the FMA, MCE, and UE bits of the page_t.
         */
        if (errors) {
                uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
                if (toxic & PR_UE_SCRUBBED) {
                        toxic &= ~PR_UE_SCRUBBED;
                        toxic |= PR_UE;
                }
                *errors = toxic;
        }

        return (rc);
}

/*
 * Test to see if the page_t for a given PA is retired, and return the
 * hardware errors we have seen on the page if requested.
 *
 * Called from mmioctl_page_retire on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is retired,
 *   - EIO if the page is not retired and has no errors,
 *   - EAGAIN if the page is not retired but is pending; and
 *   - EINVAL if the PA is whacko.
 */
int
page_retire_check(uint64_t pa, uint64_t *errors)
{
        page_t  *pp;

        if (errors) {
                *errors = 0;
        }

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                return (page_retire_done(pp, PRD_INVALID_PA));
        }

        return (page_retire_check_pp(pp, errors));
}
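
/*
 * A sketch of how a caller might use the check routines above
 * (illustrative only, not code from this file): poll a previously
 * requested retirement and decode the toxic bits that come back:
 *
 *      uint64_t errs;
 *      int err = page_retire_check(pa, &errs);
 *      if (err == 0 || err == EAGAIN) {
 *              (errs & PR_UE)          a UE has been seen on the page
 *              (errs & PR_MCE)         multiple CEs have been seen
 *              (errs & PR_FMA)         FMA requested the retirement
 *      }
 */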

/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
        page_t *first, *pp, *cpp, *cpp2, *lpp;

        /*
         * Tests the corner case where a large page can't be retired
         * because one of the constituent pages is locked. We mark
         * one page to be retired and try to retire it, and mark the
         * other page to be retired but don't try to retire it, so
         * that page_unlock() in the failure path will recurse and try
         * to retire THAT page. This is the worst possible situation
         * we can get ourselves into.
         */
        memsegs_lock(0);
        pp = first = page_first();
        do {
                if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
                        cpp = pp + 1;
                        lpp = PP_ISFREE(pp)? pp : pp + 2;
                        cpp2 = pp + 3;
                        if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
                                continue;
                        if (!page_trylock(cpp, SE_EXCL)) {
                                page_unlock(lpp);
                                continue;
                        }

                        /* fails */
                        (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);

                        page_unlock(lpp);
                        page_unlock(cpp);
                        (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
                        (void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
                }
        } while ((pp = page_next(pp)) != first);
        memsegs_unlock(0);

        return (0);
}