1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include "assym.h"
  26 
  27 /*
  28  * General assembly language routines.
  29  * It is the intent of this file to contain routines that are
  30  * specific to cpu architecture.
  31  */
  32 
  33 /*
  34  * WARNING: If you add a fast trap handler which can be invoked by a
  35  * non-privileged user, you may have to use the FAST_TRAP_DONE macro
  36  * instead of "done" instruction to return back to the user mode. See
  37  * comments for the "fast_trap_done" entry point for more information.
  38  */
/*
 * Return from a fast trap through the common fast_trap_done epilogue
 * instead of a bare "done" instruction (see warning above).
 */
#define FAST_TRAP_DONE  \
        ba,a    fast_trap_done
  41 
  42 /*
  43  * Override GET_NATIVE_TIME for the cpu module code.  This is not
  44  * guaranteed to be exactly one instruction, be careful of using
  45  * the macro in delay slots.
  46  *
  47  * Do not use any instruction that modifies condition codes as the 
  48  * caller may depend on these to remain unchanged across the macro.
  49  */
  50 #if defined(CHEETAH) || defined(OLYMPUS_C)
  51 
/* %stick is readable in exactly one instruction on these CPUs. */
#define GET_NATIVE_TIME(out, scr1, scr2) \
        rd      STICK, out
/* Read-modify-write: advance %stick by 'delta'; scratch regs unused. */
#define DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
        rd      STICK, reg;             \
        add     reg, delta, reg;        \
        wr      reg, STICK
/* Read the %stick_compare register; scratch reg unused. */
#define RD_TICKCMPR(out, scr)           \
        rd      STICK_COMPARE, out
/* Write the %stick_compare register; scratch regs and label unused. */
#define WR_TICKCMPR(in, scr1, scr2, label) \
        wr      in, STICK_COMPARE
  62 
  63 #elif defined(HUMMINGBIRD)
  64 #include <sys/spitregs.h>
  65 
  66 /*
  67  * the current hummingbird version of %stick and %stick_cmp
  68  * were both implemented as (2) 32-bit locations in ASI_IO space;
  69  * the hdwr should support atomic r/w; meanwhile: ugly alert! ...
  70  *
  71  * 64-bit opcodes are required, but move only 32-bits:
  72  *
  73  * ldxa [phys]ASI_IO, %dst      reads  the low 32-bits from phys into %dst
  74  * stxa %src, [phys]ASI_IO      writes the low 32-bits from %src into phys
  75  *
  76  * reg equivalent               [phys]ASI_IO
  77  * ------------------           ---------------
  78  * %stick_cmp  low-32           0x1FE.0000.F060
  79  * %stick_cmp high-32           0x1FE.0000.F068
  80  * %stick      low-32           0x1FE.0000.F070
  81  * %stick     high-32           0x1FE.0000.F078
  82  */
#define HSTC_LOW        0x60                    /* stick_cmp low  32-bits */
#define HSTC_HIGH       0x68                    /* stick_cmp high 32-bits */
#define HST_LOW         0x70                    /* stick low  32-bits */
#define HST_HIGH        0x78                    /* stick high 32-bits */
#define HST_DIFF        0x08                    /* low<-->high diff */

/*
 * SETL41() builds the 41-bit physical address 0x1FE.0000.F0'byte'
 * (the ASI_IO location of the requested stick/stick_cmp word, per the
 * table above) into 'reg'.
 * Any change in the number of instructions in SETL41()
 * will affect SETL41_OFF
 */
#define SETL41(reg, byte) \
        sethi   %hi(0x1FE00000), reg;           /* 0000.0000.1FE0.0000 */ \
        or      reg, 0xF, reg;                  /* 0000.0000.1FE0.000F */ \
        sllx    reg, 12, reg;                   /* 0000.01FE.0000.F000 */ \
        or      reg, byte, reg;                 /* 0000.01FE.0000.F0xx */

/*
 * SETL41_OFF is used to calculate the relative PC value when a
 * branch instruction needs to go over SETL41() macro
 * (4 instructions of 4 bytes each).
 */
#define SETL41_OFF  16
 104 
 105 /*
 106  * reading stick requires 2 loads, and there could be an intervening
 107  * low-to-high 32-bit rollover resulting in a return value that is
 108  * off by about (2 ^ 32); this rare case is prevented by re-reading
 109  * the low-32 bits after the high-32 and verifying the "after" value
 110  * is >= the "before" value; if not, increment the high-32 value.
 111  *
 112  * this method is limited to 1 rollover, and based on the fixed
 113  * stick-frequency (5555555), requires the loads to complete within
 114  * 773 seconds; incrementing the high-32 value will not overflow for
 115  * about 52644 years.
 116  *
 117  * writing stick requires 2 stores; if the old/new low-32 value is
 118  * near 0xffffffff, there could be another rollover (also rare).
 119  * to prevent this, we first write a 0 to the low-32, then write
 120  * new values to the high-32 then the low-32.
 121  *
 122  * When we detect a carry in the lower %stick register, we need to
 123  * read HST_HIGH again. However at the point where we detect this,
 124  * we need to rebuild the register address HST_HIGH.This involves more
 125  * than one instructions and a branch is unavoidable. However, most of
 126  * the time, there is no carry. So we take the penalty of a branch
 127  * instruction only when there is carry (less frequent).
 128  * 
 129  * For GET_NATIVE_TIME(), we start afresh and branch to SETL41().
 130  * For DELTA_NATIVE_TIME(), we branch to just after SETL41() since
 131  * addr already points to HST_LOW.
 132  *
 133  * NOTE: this method requires disabling interrupts before using
 134  * DELTA_NATIVE_TIME.
 135  */
/*
 * Read 64-bit %stick: tmp = low half (before), out = high half,
 * scr = low half (after).  If the low half rolled over between the
 * two low reads (after - before < 0), branch back through SETL41()
 * and retry, per the rollover discussion above.
 */
#define GET_NATIVE_TIME(out, scr, tmp)  \
        SETL41(scr, HST_LOW);           \
        ldxa    [scr]ASI_IO, tmp;       \
        inc     HST_DIFF, scr;          \
        ldxa    [scr]ASI_IO, out;       \
        dec     HST_DIFF, scr;          \
        ldxa    [scr]ASI_IO, scr;       \
        sub     scr, tmp, tmp;          \
        brlz,pn tmp, .-(SETL41_OFF+24); \
        sllx    out, 32, out;           \
        or      out, scr, out
/*
 * Read %stick (rollover-safe, same scheme as GET_NATIVE_TIME but
 * branching only past SETL41 since addr already holds HST_LOW), add
 * 'delta', then write it back: low half zeroed first, then high,
 * then low.  Caller must have interrupts disabled (see NOTE above).
 */
#define DELTA_NATIVE_TIME(delta, addr, high, low, tmp) \
        SETL41(addr, HST_LOW);          \
        ldxa    [addr]ASI_IO, tmp;      \
        inc     HST_DIFF, addr;         \
        ldxa    [addr]ASI_IO, high;     \
        dec     HST_DIFF, addr;         \
        ldxa    [addr]ASI_IO, low;      \
        sub     low, tmp, tmp;          \
        brlz,pn tmp, .-24;              \
        sllx    high, 32, high;         \
        or      high, low, high;        \
        add     high, delta, high;      \
        srl     high, 0, low;           \
        srlx    high, 32, high;         \
        stxa    %g0, [addr]ASI_IO;      \
        inc     HST_DIFF, addr;         \
        stxa    high, [addr]ASI_IO;     \
        dec     HST_DIFF, addr;         \
        stxa    low, [addr]ASI_IO
/* Read 64-bit %stick_cmp: low then high 32 bits (no rollover retry). */
#define RD_TICKCMPR(out, scr)           \
        SETL41(scr, HSTC_LOW);          \
        ldxa    [scr]ASI_IO, out;       \
        inc     HST_DIFF, scr;          \
        ldxa    [scr]ASI_IO, scr;       \
        sllx    scr, 32, scr;           \
        or      scr, out, out
/* Write 64-bit %stick_cmp: high 32 bits first, then low 32 bits. */
#define WR_TICKCMPR(in, scra, scrd, label) \
        SETL41(scra, HSTC_HIGH);        \
        srlx    in, 32, scrd;           \
        stxa    scrd, [scra]ASI_IO;     \
        dec     HST_DIFF, scra;         \
        stxa    in, [scra]ASI_IO
 179 
 180 #else   /* !CHEETAH && !HUMMINGBIRD */
 181 
/* Default case: use the privileged %tick register directly. */
#define GET_NATIVE_TIME(out, scr1, scr2) \
        rdpr    %tick, out
/* Read-modify-write: advance %tick by 'delta'; scratch regs unused. */
#define DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
        rdpr    %tick, reg;             \
        add     reg, delta, reg;        \
        wrpr    reg, %tick
/* Read the tick-compare register; scratch reg unused. */
#define RD_TICKCMPR(out, scr)           \
        rd      TICK_COMPARE, out
#ifdef BB_ERRATA_1 /* writes to TICK_COMPARE may fail */
/*
 * Writes to the TICK_COMPARE register sometimes fail on blackbird modules.
 * The failure occurs only when the following instruction decodes to wr or
 * wrpr.  The workaround is to immediately follow writes to TICK_COMPARE
 * with a read, thus stalling the pipe and keeping following instructions
 * from causing data corruption.  Aligning to a quadword will ensure these
 * two instructions are not split due to i$ misses.
 */
#define WR_TICKCMPR(cmpr,scr1,scr2,label)       \
        ba,a    .bb_errata_1.label              ;\
        .align  64                              ;\
.bb_errata_1.label:                             ;\
        wr      cmpr, TICK_COMPARE              ;\
        rd      TICK_COMPARE, %g0
#else   /* BB_ERRATA_1 */
#define WR_TICKCMPR(in,scr1,scr2,label)         \
        wr      in, TICK_COMPARE
#endif  /* BB_ERRATA_1 */
 209 
 210 #endif  /* !CHEETAH && !HUMMINGBIRD */
 211 
 212 #include <sys/clock.h>
 213 
 214 
 215 #include <sys/asm_linkage.h>
 216 #include <sys/privregs.h>
 217 #include <sys/machparam.h>        /* To get SYSBASE and PAGESIZE */
 218 #include <sys/machthread.h>
 219 #include <sys/clock.h>
 220 #include <sys/intreg.h>
 221 #include <sys/psr_compat.h>
 222 #include <sys/isa_defs.h>
 223 #include <sys/dditypes.h>
 224 #include <sys/intr.h>
 225 
 226 #include "assym.h"
 227 
/*
 * int get_impl(void)
 *
 * Return this CPU's implementation number (extracted by the
 * GET_CPU_IMPL macro from machthread.h).
 */
        ENTRY(get_impl)
        GET_CPU_IMPL(%o0)
        retl
        nop
        SET_SIZE(get_impl)
 233 
 234 /*
 235  * Softint generated when counter field of tick reg matches value field
 236  * of tick_cmpr reg
 237  */
/*
 * void tickcmpr_set(uint64_t clock_cycles)
 *
 * Arm the tick/stick-compare register to fire at absolute native time
 * 'clock_cycles'.  Because the requested time may already have passed
 * by the time the write lands, the counter is re-read afterwards; if
 * the written value is not in the future, retry with an exponentially
 * growing step until it is.
 */
        ENTRY_NP(tickcmpr_set)
        ! get 64-bit clock_cycles interval
        mov     %o0, %o2
        mov     8, %o3                  ! A reasonable initial step size
1:
        WR_TICKCMPR(%o2,%o4,%o5,__LINE__)       ! Write to TICK_CMPR

        GET_NATIVE_TIME(%o0, %o4, %o5)  ! Read %tick to confirm the
        sllx    %o0, 1, %o0             !   value we wrote was in the future.
        srlx    %o0, 1, %o0             !   (shake off the npt bit)

        cmp     %o2, %o0                ! If the value we wrote was in the
        bg,pt   %xcc, 2f                !   future, then blow out of here.
        sllx    %o3, 1, %o3             ! If not, then double our step size,
        ba,pt   %xcc, 1b                !   and take another lap.
        add     %o0, %o3, %o2           !
2:
        retl
        nop
        SET_SIZE(tickcmpr_set)
 258 
/*
 * void tickcmpr_disable(void)
 *
 * Disable tick-compare interrupts by writing the interrupt-disable
 * bit (1 << TICKINT_DIS_SHFT) to the compare register.
 */
        ENTRY_NP(tickcmpr_disable)
        mov     1, %g1
        sllx    %g1, TICKINT_DIS_SHFT, %o0
        WR_TICKCMPR(%o0,%o4,%o5,__LINE__)       ! Write to TICK_CMPR
        retl
        nop
        SET_SIZE(tickcmpr_disable)
 266 
#ifdef DEBUG
        .seg    ".text"
        /* panic message used by tick_write_delta() below (DEBUG only) */
tick_write_panic:
        .asciz  "tick_write_delta: interrupts already disabled on entry"
#endif  /* DEBUG */
 272 
 273 /*
 274  * tick_write_delta() increments %tick by the specified delta.  This should
 275  * only be called after a CPR event to assure that gethrtime() continues to
 * increase monotonically.  Obviously, writing %tick needs to be done very
 277  * carefully to avoid introducing unnecessary %tick skew across CPUs.  For
 278  * this reason, we make sure we're i-cache hot before actually writing to
 279  * %tick.
 280  */
 281         ENTRY_NP(tick_write_delta)
 282         rdpr    %pstate, %g1
 283 #ifdef DEBUG
 284         andcc   %g1, PSTATE_IE, %g0     ! If DEBUG, check that interrupts
 285         bnz     0f                      ! aren't already disabled.
 286         sethi   %hi(tick_write_panic), %o1
 287         save    %sp, -SA(MINFRAME), %sp ! get a new window to preserve caller
 288         call    panic
 289         or      %i1, %lo(tick_write_panic), %o0
 290 #endif  /* DEBUG */
 291 0:      wrpr    %g1, PSTATE_IE, %pstate ! Disable interrupts
 292         mov     %o0, %o2
 293         ba      0f                      ! Branch to cache line-aligned instr.
 294         nop
 295         .align  16
 296 0:      nop                             ! The next 3 instructions are now hot.
 297         DELTA_NATIVE_TIME(%o2, %o3, %o4, %o5, %g2)      ! read/inc/write %tick
 298 
 299         retl                            ! Return
 300         wrpr    %g0, %g1, %pstate       !     delay: Re-enable interrupts
 301 
/*
 * int tickcmpr_disabled(void)
 *
 * Return non-zero iff the tick-compare interrupt-disable bit is set.
 */
        ENTRY_NP(tickcmpr_disabled)
        RD_TICKCMPR(%g1, %o0)
        retl
        srlx    %g1, TICKINT_DIS_SHFT, %o0      ! delay: isolate INT_DIS bit
        SET_SIZE(tickcmpr_disabled)
 307 
 308 /*
 309  * Get current tick
 310  */
 311 
/*
 * uint64_t gettick(void) / randtick(void)
 *
 * Return the current native time (%tick or %stick, per the
 * GET_NATIVE_TIME macro selected above).
 */
        ENTRY(gettick)
        ALTENTRY(randtick)
        GET_NATIVE_TIME(%o0, %o2, %o3)
        retl
        nop
        SET_SIZE(randtick)
        SET_SIZE(gettick)
 319 
 320 
 321 /*
 322  * Return the counter portion of the tick register.
 323  */
 324 
/*
 * uint64_t gettick_counter(void)
 *
 * Return the counter field of %tick (with the topmost, NPT, bit
 * cleared by the shift-left/shift-right pair).
 */
        ENTRY_NP(gettick_counter)
        rdpr    %tick, %o0
        sllx    %o0, 1, %o0
        retl
        srlx    %o0, 1, %o0             ! shake off npt bit
        SET_SIZE(gettick_counter)
 331 
 332 /*
 333  * Provide a C callable interface to the trap that reads the hi-res timer.
 334  * Returns 64-bit nanosecond timestamp in %o0 and %o1.
 335  */
 336 
/*
 * hrtime_t gethrtime(void)
 *
 * Return the current high-resolution time in nanoseconds.
 */
        ENTRY_NP(gethrtime)
        GET_HRTIME(%g1, %o0, %o1, %o2, %o3, %o4, %o5, %g2)
                                                        ! %g1 = hrtime
        retl
        mov     %g1, %o0
        SET_SIZE(gethrtime)
 343 
/*
 * hrtime_t gethrtime_unscaled(void)
 *
 * Return the raw native time (ticks), unscaled to nanoseconds.
 */
        ENTRY_NP(gethrtime_unscaled)
        GET_NATIVE_TIME(%g1, %o2, %o3)                  ! %g1 = native time
        retl
        mov     %g1, %o0
        SET_SIZE(gethrtime_unscaled)
 349 
/*
 * hrtime_t gethrtime_waitfree(void) / dtrace_gethrtime(void)
 *
 * Wait-free variant: read native time and scale it to nanoseconds
 * directly, without the GET_HRTIME lock protocol.
 */
        ENTRY_NP(gethrtime_waitfree)
        ALTENTRY(dtrace_gethrtime)
        GET_NATIVE_TIME(%g1, %o2, %o3)                  ! %g1 = native time
        NATIVE_TIME_TO_NSEC(%g1, %o2, %o3)
        retl
        mov     %g1, %o0
        SET_SIZE(dtrace_gethrtime)
        SET_SIZE(gethrtime_waitfree)
 358 
/*
 * hrtime_t gethrtime_max(void)
 *
 * Return the maximum native time scaled to nanoseconds, clamped to
 * the largest positive hrtime_t if scaling set the sign bit.
 */
        ENTRY(gethrtime_max)
        NATIVE_TIME_MAX(%g1)
        NATIVE_TIME_TO_NSEC(%g1, %o0, %o1)

        ! hrtime_t's are signed, max hrtime_t must be positive
        mov     -1, %o2
        brlz,a  %g1, 1f                 ! if negative, substitute
        srlx    %o2, 1, %g1             !   delay (annulled): 2^63 - 1
1:
        retl
        mov     %g1, %o0
        SET_SIZE(gethrtime_max)
 371 
/*
 * void scalehrtime(hrtime_t *hrt)
 *
 * Convert *hrt in place from native (tick) units to nanoseconds.
 */
        ENTRY(scalehrtime)
        ldx     [%o0], %o1
        NATIVE_TIME_TO_NSEC(%o1, %o2, %o3)
        retl
        stx     %o1, [%o0]              ! delay: store scaled value back
        SET_SIZE(scalehrtime)
 378 
 379 /*
 380  * Fast trap to return a timestamp, uses trap window, leaves traps
 381  * disabled.  Returns a 64-bit nanosecond timestamp in %o0 and %o1.
 382  *
 383  * This is the handler for the ST_GETHRTIME trap.
 384  */
 385 
/*
 * ST_GETHRTIME fast-trap handler (see block comment above); returns
 * the 64-bit nanosecond timestamp split across %o0 (hi) and %o1 (lo).
 */
        ENTRY_NP(get_timestamp)
        GET_HRTIME(%g1, %g2, %g3, %g4, %g5, %o0, %o1, %o2)      ! %g1 = hrtime
        srlx    %g1, 32, %o0                            ! %o0 = hi32(%g1)
        srl     %g1, 0, %o1                             ! %o1 = lo32(%g1)
        FAST_TRAP_DONE
        SET_SIZE(get_timestamp)
 392 
 393 /*
 394  * Macro to convert GET_HRESTIME() bits into a timestamp.
 395  *
 396  * We use two separate macros so that the platform-dependent GET_HRESTIME()
 397  * can be as small as possible; CONV_HRESTIME() implements the generic part.
 398  */
 399 #define CONV_HRESTIME(hrestsec, hrestnsec, adj, nslt, nano) \
 400         brz,pt  adj, 3f;                /* no adjustments, it's easy */ \
 401         add     hrestnsec, nslt, hrestnsec; /* hrest.tv_nsec += nslt */ \
 402         brlz,pn adj, 2f;                /* if hrestime_adj negative */  \
 403         srlx    nslt, ADJ_SHIFT, nslt;  /* delay: nslt >>= 4 */           \
 404         subcc   adj, nslt, %g0;         /* hrestime_adj - nslt/16 */    \
 405         movg    %xcc, nslt, adj;        /* adj by min(adj, nslt/16) */  \
 406         ba      3f;                     /* go convert to sec/nsec */    \
 407         add     hrestnsec, adj, hrestnsec; /* delay: apply adjustment */ \
 408 2:      addcc   adj, nslt, %g0;         /* hrestime_adj + nslt/16 */    \
 409         bge,a,pt %xcc, 3f;              /* is adj less negative? */     \
 410         add     hrestnsec, adj, hrestnsec; /* yes: hrest.nsec += adj */ \
 411         sub     hrestnsec, nslt, hrestnsec; /* no: hrest.nsec -= nslt/16 */ \
 412 3:      cmp     hrestnsec, nano;        /* more than a billion? */      \
 413         bl,pt   %xcc, 4f;               /* if not, we're done */        \
 414         nop;                            /* delay: do nothing :( */      \
 415         add     hrestsec, 1, hrestsec;  /* hrest.tv_sec++; */           \
 416         sub     hrestnsec, nano, hrestnsec; /* hrest.tv_nsec -= NANOSEC; */ \
 417         ba,a    3b;                     /* check >= billion again */ \
 418 4:
 419 
/*
 * void gethrestime(timestruc_t *tp)
 *
 * Store the current hires wall-clock time {sec, nsec} into *tp.
 */
        ENTRY_NP(gethrestime)
        GET_HRESTIME(%o1, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
        CONV_HRESTIME(%o1, %o2, %o3, %o4, %o5)
        stn     %o1, [%o0]                      ! tp->tv_sec
        retl
        stn     %o2, [%o0 + CLONGSIZE]          ! delay: tp->tv_nsec
        SET_SIZE(gethrestime)
 427 
 428 /*
 429  * Similar to gethrestime(), but gethrestime_sec() returns current hrestime
 430  * seconds.
 431  */
/*
 * time_t gethrestime_sec(void)
 *
 * Return only the seconds portion of the current hrestime.
 */
        ENTRY_NP(gethrestime_sec)
        GET_HRESTIME(%o0, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
        CONV_HRESTIME(%o0, %o2, %o3, %o4, %o5)
        retl                                    ! %o0 current hrestime seconds
        nop
        SET_SIZE(gethrestime_sec)
 438 
 439 /*
 440  * Returns the hrestime on the last tick.  This is simpler than gethrestime()
 441  * and gethrestime_sec():  no conversion is required.  gethrestime_lasttick()
 442  * follows the same locking algorithm as GET_HRESTIME and GET_HRTIME,
 443  * outlined in detail in clock.h.  (Unlike GET_HRESTIME/GET_HRTIME, we don't
 444  * rely on load dependencies to effect the membar #LoadLoad, instead declaring
 445  * it explicitly.)
 446  */
/*
 * void gethrestime_lasttick(timespec_t *tp)
 *
 * See block comment above: snapshot hrestime under the hres_lock
 * read protocol, retrying if the lock is held or changes.
 */
        ENTRY_NP(gethrestime_lasttick)
        sethi   %hi(hres_lock), %o1
0:
        lduw    [%o1 + %lo(hres_lock)], %o2     ! Load lock value
        membar  #LoadLoad                       ! Load of lock must complete
        andn    %o2, 1, %o2                     ! Mask off lowest bit
        ldn     [%o1 + %lo(hrestime)], %g1      ! Seconds.
        add     %o1, %lo(hrestime), %o4
        ldn     [%o4 + CLONGSIZE], %g2          ! Nanoseconds.
        membar  #LoadLoad                       ! All loads must complete
        lduw    [%o1 + %lo(hres_lock)], %o3     ! Reload lock value
        cmp     %o3, %o2                        ! If lock is locked or has
        bne     0b                              !   changed, retry.
        stn     %g1, [%o0]                      ! Delay: store seconds
        retl
        stn     %g2, [%o0 + CLONGSIZE]          ! Delay: store nanoseconds
        SET_SIZE(gethrestime_lasttick)
 464 
 465 /*
 466  * Fast trap for gettimeofday().  Returns a timestruc_t in %o0 and %o1.
 467  *
 468  * This is the handler for the ST_GETHRESTIME trap.
 469  */
 470 
/*
 * ST_GETHRESTIME fast-trap handler (see block comment above);
 * returns seconds in %o0 and nanoseconds in %o1.
 */
        ENTRY_NP(get_hrestime)
        GET_HRESTIME(%o0, %o1, %g1, %g2, %g3, %g4, %g5, %o2, %o3)
        CONV_HRESTIME(%o0, %o1, %g1, %g2, %g3)
        FAST_TRAP_DONE
        SET_SIZE(get_hrestime)
 476 
 477 /*
 478  * Fast trap to return lwp virtual time, uses trap window, leaves traps
 479  * disabled.  Returns a 64-bit number in %o0:%o1, which is the number
 480  * of nanoseconds consumed.
 481  *
 482  * This is the handler for the ST_GETHRVTIME trap.
 483  *
 484  * Register usage:
 485  *      %o0, %o1 = return lwp virtual time
 486  *      %o2 = CPU/thread
 487  *      %o3 = lwp
 488  *      %g1 = scratch
 489  *      %g5 = scratch
 490  */
/*
 * ST_GETHRVTIME fast-trap handler; register usage documented in the
 * block comment above.
 */
        ENTRY_NP(get_virtime)
        GET_NATIVE_TIME(%g5, %g1, %g2)  ! %g5 = native time in ticks
        CPU_ADDR(%g2, %g3)                      ! CPU struct ptr to %g2
        ldn     [%g2 + CPU_THREAD], %g2         ! thread pointer to %g2
        ldn     [%g2 + T_LWP], %g3              ! lwp pointer to %g3

        /*
         * Subtract start time of current microstate from time
         * of day to get increment for lwp virtual time.
         */
        ldx     [%g3 + LWP_STATE_START], %g1    ! ms_state_start
        sub     %g5, %g1, %g5

        /*
         * Add current value of ms_acct[LMS_USER]
         */
        ldx     [%g3 + LWP_ACCT_USER], %g1      ! ms_acct[LMS_USER]
        add     %g5, %g1, %g5
        NATIVE_TIME_TO_NSEC(%g5, %g1, %o0)      ! scale ticks to nanoseconds

        srl     %g5, 0, %o1                     ! %o1 = lo32(%g5)
        srlx    %g5, 32, %o0                    ! %o0 = hi32(%g5)

        FAST_TRAP_DONE
        SET_SIZE(get_virtime)
 516 
 517 
 518 
        .seg    ".text"
        /* panic message used below if hrtime_base would go backwards */
hrtime_base_panic:
        .asciz  "hrtime_base stepping back"


/*
 * void hres_tick(void)
 *
 * Per-tick clock update: acquire hres_lock, record the current native
 * time in hres_last_tick, advance hrtime_base, consume at most one
 * tick's worth of any pending hrestime_adj, fold the elapsed
 * nanoseconds into hrestime (normalizing tv_nsec and flagging one_sec
 * on second rollover), then release the lock.  Panics if hrtime_base
 * would step backwards.
 */
        ENTRY_NP(hres_tick)
        save    %sp, -SA(MINFRAME), %sp ! get a new window

        sethi   %hi(hrestime), %l4
        ldstub  [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5  ! try locking
7:      tst     %l5
        bz,pt   %xcc, 8f                        ! if we got it, drive on
        ld      [%l4 + %lo(nsec_scale)], %l5    ! delay: %l5 = scaling factor
        ldub    [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
9:      tst     %l5                             ! spin reading until clear,
        bz,a,pn %xcc, 7b                        !   then retry the ldstub
        ldstub  [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
        ba,pt   %xcc, 9b
        ldub    [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
8:
        membar  #StoreLoad|#StoreStore

        !
        ! update hres_last_tick.  %l5 has the scaling factor (nsec_scale).
        !
        ldx     [%l4 + %lo(hrtime_base)], %g1   ! load current hrtime_base
        GET_NATIVE_TIME(%l0, %l3, %l6)          ! current native time
        stx     %l0, [%l4 + %lo(hres_last_tick)]! prev = current
        ! convert native time to nsecs
        NATIVE_TIME_TO_NSEC_SCALE(%l0, %l5, %l2, NSEC_SHIFT)

        sub     %l0, %g1, %i1                   ! get accurate nsec delta

        ldx     [%l4 + %lo(hrtime_base)], %l1   ! sanity check: panic if
        cmp     %l1, %l0                        !   hrtime_base would step
        bg,pn   %xcc, 9f                        !   backwards (label 9 below)
        nop

        stx     %l0, [%l4 + %lo(hrtime_base)]   ! update hrtime_base

        !
        ! apply adjustment, if any
        !
        ldx     [%l4 + %lo(hrestime_adj)], %l0  ! %l0 = hrestime_adj
        brz     %l0, 2f
                                                ! hrestime_adj == 0 ?
                                                ! yes, skip adjustments
        clr     %l5                             ! delay: set adj to zero
        tst     %l0                             ! is hrestime_adj >= 0 ?
        bge,pt  %xcc, 1f                        ! yes, go handle positive case
        srl     %i1, ADJ_SHIFT, %l5             ! delay: %l5 = adj

        addcc   %l0, %l5, %g0                   ! hrestime_adj < -adj ?
        bl,pt   %xcc, 2f                        ! yes, use current adj
        neg     %l5                             ! delay: %l5 = -adj
        ba,pt   %xcc, 2f
        mov     %l0, %l5                        ! no, so set adj = hrestime_adj
1:
        subcc   %l0, %l5, %g0                   ! hrestime_adj < adj ?
        bl,a,pt %xcc, 2f                        ! yes, set adj = hrestime_adj
        mov     %l0, %l5                        ! delay: adj = hrestime_adj
2:
        ldx     [%l4 + %lo(timedelta)], %l0     ! %l0 = timedelta
        sub     %l0, %l5, %l0                   ! timedelta -= adj

        stx     %l0, [%l4 + %lo(timedelta)]     ! store new timedelta
        stx     %l0, [%l4 + %lo(hrestime_adj)]  ! hrestime_adj = timedelta

        or      %l4, %lo(hrestime), %l2
        ldn     [%l2], %i2                      ! %i2:%i3 = hrestime sec:nsec
        ldn     [%l2 + CLONGSIZE], %i3
        add     %i3, %l5, %i3                   ! hrestime.nsec += adj
        add     %i3, %i1, %i3                   ! hrestime.nsec += nslt

        set     NANOSEC, %l5                    ! %l5 = NANOSEC
        cmp     %i3, %l5
        bl,pt   %xcc, 5f                        ! if hrestime.tv_nsec < NANOSEC
        sethi   %hi(one_sec), %i1               ! delay
        add     %i2, 0x1, %i2                   ! hrestime.tv_sec++
        sub     %i3, %l5, %i3                   ! hrestime.tv_nsec - NANOSEC
        mov     0x1, %l5
        st      %l5, [%i1 + %lo(one_sec)]       ! note that a second elapsed
5:
        stn     %i2, [%l2]
        stn     %i3, [%l2 + CLONGSIZE]          ! store the new hrestime

        membar  #StoreStore

        ld      [%l4 + %lo(hres_lock)], %i1
        inc     %i1                             ! release lock
        st      %i1, [%l4 + %lo(hres_lock)]     ! clear hres_lock

        ret
        restore

9:
        !
        ! release hres_lock
        !
        ld      [%l4 + %lo(hres_lock)], %i1
        inc     %i1
        st      %i1, [%l4 + %lo(hres_lock)]

        sethi   %hi(hrtime_base_panic), %o0
        call    panic
        or      %o0, %lo(hrtime_base_panic), %o0

        SET_SIZE(hres_tick)
 627 
        .seg    ".text"
kstat_q_panic_msg:
        .asciz  "kstat_q_exit: qlen == 0"

/*
 * Panic path for the DEBUG kstat queue-exit routines below: reached
 * when a queue-exit is attempted while the queue length is zero.
 */
        ENTRY(kstat_q_panic)
        save    %sp, -SA(MINFRAME), %sp
        sethi   %hi(kstat_q_panic_msg), %o0
        call    panic
        or      %o0, %lo(kstat_q_panic_msg), %o0
        /*NOTREACHED*/
        SET_SIZE(kstat_q_panic)
 639 
#define BRZPN   brz,pn
#define BRZPT   brz,pt

/*
 * Update one kstat I/O queue (wait or run, selected by QTYPE):
 * QOP (add/sub) adjusts the queue length; QBR/QZERO branch out (or to
 * kstat_q_panic) when the old length is zero; the time and lentime
 * accumulators are then advanced by the interval since the last
 * update.  %g1 holds the current native time (set by the caller);
 * QRETURN is the return sequence, with the final store in its
 * delay slot.
 */
#define KSTAT_Q_UPDATE(QOP, QBR, QZERO, QRETURN, QTYPE) \
        ld      [%o0 + QTYPE/**/CNT], %o1;      /* %o1 = old qlen */    \
        QOP     %o1, 1, %o2;                    /* %o2 = new qlen */    \
        QBR     %o1, QZERO;                     /* done if qlen == 0 */ \
        st      %o2, [%o0 + QTYPE/**/CNT];      /* delay: save qlen */  \
        ldx     [%o0 + QTYPE/**/LASTUPDATE], %o3;                       \
        ldx     [%o0 + QTYPE/**/TIME], %o4;     /* %o4 = old time */    \
        ldx     [%o0 + QTYPE/**/LENTIME], %o5;  /* %o5 = old lentime */ \
        sub     %g1, %o3, %o2;                  /* %o2 = time delta */  \
        mulx    %o1, %o2, %o3;                  /* %o3 = cur lentime */ \
        add     %o4, %o2, %o4;                  /* %o4 = new time */    \
        add     %o5, %o3, %o5;                  /* %o5 = new lentime */ \
        stx     %o4, [%o0 + QTYPE/**/TIME];     /* save time */         \
        stx     %o5, [%o0 + QTYPE/**/LENTIME];  /* save lentime */      \
QRETURN;                                                                \
        stx     %g1, [%o0 + QTYPE/**/LASTUPDATE]; /* lastupdate = now */
 659 
#if !defined(DEBUG)
/*
 * same as KSTAT_Q_UPDATE but without the zero-length check:
 * QBR     %o1, QZERO;
 * to be used only with non-debug build. mimics ASSERT() behaviour.
 */
#define KSTAT_Q_UPDATE_ND(QOP, QRETURN, QTYPE) \
        ld      [%o0 + QTYPE/**/CNT], %o1;      /* %o1 = old qlen */    \
        QOP     %o1, 1, %o2;                    /* %o2 = new qlen */    \
        st      %o2, [%o0 + QTYPE/**/CNT];      /* delay: save qlen */  \
        ldx     [%o0 + QTYPE/**/LASTUPDATE], %o3;                       \
        ldx     [%o0 + QTYPE/**/TIME], %o4;     /* %o4 = old time */    \
        ldx     [%o0 + QTYPE/**/LENTIME], %o5;  /* %o5 = old lentime */ \
        sub     %g1, %o3, %o2;                  /* %o2 = time delta */  \
        mulx    %o1, %o2, %o3;                  /* %o3 = cur lentime */ \
        add     %o4, %o2, %o4;                  /* %o4 = new time */    \
        add     %o5, %o3, %o5;                  /* %o5 = new lentime */ \
        stx     %o4, [%o0 + QTYPE/**/TIME];     /* save time */         \
        stx     %o5, [%o0 + QTYPE/**/LENTIME];  /* save lentime */      \
QRETURN;                                                                \
        stx     %g1, [%o0 + QTYPE/**/LASTUPDATE]; /* lastupdate = now */
#endif
 682 
        .align 16
/*
 * Enter the wait queue: bump the wait-queue stats for the kstat I/O
 * structure in %o0.
 */
        ENTRY(kstat_waitq_enter)
        GET_NATIVE_TIME(%g1, %g2, %g3)
        KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
        SET_SIZE(kstat_waitq_enter)
 688 
        .align 16
/*
 * Exit the wait queue: decrement the wait-queue stats for the kstat
 * I/O structure in %o0.  DEBUG builds panic if the queue was empty.
 */
        ENTRY(kstat_waitq_exit)
        GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
        KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_W)
#else
        KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_W)
#endif
        SET_SIZE(kstat_waitq_exit)
 698 
        .align 16
/*
 * Enter the run queue: bump the run-queue stats for the kstat I/O
 * structure in %o0.
 */
        ENTRY(kstat_runq_enter)
        GET_NATIVE_TIME(%g1, %g2, %g3)
        KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
        SET_SIZE(kstat_runq_enter)
 704 
        .align 16
/*
 * Exit the run queue: decrement the run-queue stats for the kstat
 * I/O structure in %o0.  DEBUG builds panic if the queue was empty.
 */
        ENTRY(kstat_runq_exit)
        GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
        KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_R)
#else
        KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_R)
#endif
        SET_SIZE(kstat_runq_exit)
 714 
        .align 16
/*
 * Move an entry from the wait queue to the run queue: decrement the
 * wait-queue stats, then increment the run-queue stats, both against
 * the same timestamp in %g1.
 */
        ENTRY(kstat_waitq_to_runq)
        GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
        KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_W)
#else
        KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_W)
#endif
        KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
        SET_SIZE(kstat_waitq_to_runq)
 725 
        .align 16
/*
 * Move an entry from the run queue back to the wait queue: decrement
 * the run-queue stats, then increment the wait-queue stats, both
 * against the same timestamp in %g1.
 */
        ENTRY(kstat_runq_back_to_waitq)
        GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
        KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_R)
#else
        KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_R)
#endif
        KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
        SET_SIZE(kstat_runq_back_to_waitq)
 736 
 737         /*
 738          *  -- WARNING --
 739          *
 740          * The following variables MUST be together on a 128-byte boundary.
 741          * In addition to the primary performance motivation (having them all
 742          * on the same cache line(s)), code here and in the GET*TIME() macros
 743          * assumes that they all have the same high 22 address bits (so
 744          * there's only one sethi).
 745          */
        .seg    ".data"
        .global timedelta, hres_last_tick, hrestime, hrestime_adj
        .global hres_lock, nsec_scale, hrtime_base, traptrace_use_stick
        .global nsec_shift, adj_shift

        /* XXX - above comment claims 128-bytes is necessary */
        .align  64
timedelta:
        .word   0, 0            /* int64_t: outstanding time correction */
hres_last_tick:
        .word   0, 0            /* hrtime_t: native time at last hres_tick */
hrestime:
        .nword  0, 0            /* 2 longs: wall-clock {sec, nsec} */
hrestime_adj:
        .word   0, 0            /* int64_t: adjustment consumed per tick */
hres_lock:
        .word   0               /* writer lock; see protocol in clock.h */
nsec_scale:
        .word   0               /* native-to-nsec scaling factor */
hrtime_base:
        .word   0, 0            /* int64_t: hrtime base, never steps back */
traptrace_use_stick:
        .word   0
nsec_shift:
        .word   NSEC_SHIFT
adj_shift:
        .word   ADJ_SHIFT
 773 
 774 
 775 /*
 776  * drv_usecwait(clock_t n)      [DDI/DKI - section 9F]
 777  * usec_delay(int n)            [compatibility - should go one day]
 778  * Delay by spinning.
 779  *
 780  * delay for n microseconds.  numbers <= 0 delay 1 usec
 781  *
 782  * With UltraSPARC-III the combination of supporting mixed-speed CPUs
 783  * and variable clock rate for power management requires that we
 784  * use %stick to implement this routine.
 785  *
 786  * For OPL platforms that support the "sleep" instruction, we
 787  * conditionally (ifdef'ed) insert a "sleep" instruction in
 * the loop. Note that theoretically we should have moved (duplicated)
 * the code down to the spitfire/us3/opl specific asm files - but this
 * is a lot of code duplication just to add one "sleep" instruction.
 791  * We chose less code duplication for this.
 792  */
 793 
        ENTRY(drv_usecwait)
        ALTENTRY(usec_delay)
        brlez,a,pn %o0, 0f              ! n <= 0?  (,a: delay slot only if taken)
        mov     1, %o0                  !   delay: clamp to 1 usec minimum
0:
        sethi   %hi(sticks_per_usec), %o1
        lduw    [%o1 + %lo(sticks_per_usec)], %o1
        mulx    %o1, %o0, %o1           ! Scale usec to ticks
        inc     %o1                     ! We don't start on a tick edge
        GET_NATIVE_TIME(%o2, %o3, %o4)  ! %o2 = current native time
        add     %o1, %o2, %o1           ! %o1 = absolute deadline in ticks

        ! Spin until the native time passes the deadline in %o1.
1:
#ifdef  _OPL
        .word 0x81b01060                ! insert "sleep" instruction
#endif /* _OPL */                       ! use byte code for now
        cmp     %o1, %o2                ! deadline still ahead of last read?
        GET_NATIVE_TIME(%o2, %o3, %o4)  ! refresh %o2 for the next test
        bgeu,pt %xcc, 1b                ! yes - keep spinning
        nop
        retl
        nop
        SET_SIZE(usec_delay)
        SET_SIZE(drv_usecwait)
 818 
 819 /*
 820  * Level-14 interrupt prologue.
 821  */
        /*
         * Level-14 (profile) interrupt prologue: record the interrupted
         * PIL and the interrupted PC - in the kernel-PC slot if the trap
         * came from supervisor mode, in the user-PC slot otherwise,
         * zeroing the unused slot - then join pil_interrupt_common.
         * Only %g registers are used (trap-handler context: reads
         * %tstate/%tpc).
         */
        ENTRY_NP(pil14_interrupt)
        CPU_ADDR(%g1, %g2)                      ! %g1 = CPU struct pointer
        rdpr    %pil, %g6                       ! %g6 = interrupted PIL
        stn     %g6, [%g1 + CPU_PROFILE_PIL]    ! record interrupted PIL
        rdpr    %tstate, %g6
        rdpr    %tpc, %g5
        btst    TSTATE_PRIV, %g6                ! trap from supervisor mode?
        bnz,a,pt %xcc, 1f                       ! ,a: delay slot only if taken
        stn     %g5, [%g1 + CPU_PROFILE_PC]     ! if so, record kernel PC
        stn     %g5, [%g1 + CPU_PROFILE_UPC]    ! if not, record user PC
        ba      pil_interrupt_common            ! must be large-disp branch
        stn     %g0, [%g1 + CPU_PROFILE_PC]     ! zero kernel PC
1:      ba      pil_interrupt_common            ! must be large-disp branch
        stn     %g0, [%g1 + CPU_PROFILE_UPC]    ! zero user PC
        SET_SIZE(pil14_interrupt)
 837 
        /*
         * tick_rtt - level-14 tick/stick-compare epilogue.  If
         * TICK_COMPARE is enabled, requeue the cyclic backend's level-14
         * interrupt request (cbe_level14_inum) and, when a TICKINT/
         * STICKINT was pending, verify the programmed compare value is
         * still in the future - reprogramming it with exponentially
         * growing steps if it is not - before branching to
         * current_thread_complete.
         */
        ENTRY_NP(tick_rtt)
        !
        ! Load TICK_COMPARE into %o5; if bit 63 is set, then TICK_COMPARE is
        ! disabled.  If TICK_COMPARE is enabled, we know that we need to
        ! reenqueue the interrupt request structure.  We'll then check TICKINT
        ! in SOFTINT; if it's set, then we know that we were in a TICK_COMPARE
        ! interrupt.  In this case, TICK_COMPARE may have been rewritten
        ! recently; we'll compare %o5 to the current time to verify that it's
        ! in the future.
        !
        ! Note that %o5 is live until after 1f.
        ! XXX - there is a subroutine call while %o5 is live!
        !
        RD_TICKCMPR(%o5, %g1)
        srlx    %o5, TICKINT_DIS_SHFT, %g1      ! isolate the DIS bit
        brnz,pt %g1, 2f                         ! compare disabled: done
        nop

        rdpr    %pstate, %g5
        andn    %g5, PSTATE_IE, %g1
        wrpr    %g0, %g1, %pstate               ! Disable vec interrupts

        sethi   %hi(cbe_level14_inum), %o1
        ldx     [%o1 + %lo(cbe_level14_inum)], %o1
        call    intr_enqueue_req ! preserves %o5 and %g5
        mov     PIL_14, %o0

        ! Check SOFTINT for TICKINT/STICKINT
        rd      SOFTINT, %o4
        set     (TICK_INT_MASK | STICK_INT_MASK), %o0
        andcc   %o4, %o0, %g0
        bz,a,pn %icc, 2f                        ! neither pending: done
        wrpr    %g0, %g5, %pstate               ! Enable vec interrupts

        ! clear TICKINT/STICKINT
        wr      %o0, CLEAR_SOFTINT

        !
        ! Now that we've cleared TICKINT, we can reread %tick and confirm
        ! that the value we programmed is still in the future.  If it isn't,
        ! we need to reprogram TICK_COMPARE to fire as soon as possible.
        !
        GET_NATIVE_TIME(%o0, %g1, %g2)          ! %o0 = tick
        sllx    %o0, 1, %o0                     ! Clear the DIS bit
        srlx    %o0, 1, %o0
        cmp     %o5, %o0                        ! In the future?
        bg,a,pt %xcc, 2f                        ! Yes, drive on.
        wrpr    %g0, %g5, %pstate               !   delay: enable vec intr

        !
        ! If we're here, then we have programmed TICK_COMPARE with a %tick
        ! which is in the past; we'll now load an initial step size, and loop
        ! until we've managed to program TICK_COMPARE to fire in the future.
        !
        mov     8, %o4                          ! 8 = arbitrary initial step
1:      add     %o0, %o4, %o5                   ! Add the step
        WR_TICKCMPR(%o5,%g1,%g2,__LINE__)       ! Write to TICK_CMPR
        GET_NATIVE_TIME(%o0, %g1, %g2)          ! %o0 = tick
        sllx    %o0, 1, %o0                     ! Clear the DIS bit
        srlx    %o0, 1, %o0
        cmp     %o5, %o0                        ! In the future?
        bg,a,pt %xcc, 2f                        ! Yes, drive on.
        wrpr    %g0, %g5, %pstate               !    delay: enable vec intr
        ba      1b                              ! No, try again.
        sllx    %o4, 1, %o4                     !    delay: double step size

2:      ba      current_thread_complete
        nop
        SET_SIZE(tick_rtt)
 907 
 908 /*
 909  * Level-15 interrupt prologue.
 910  */
       /*
        * Level-15 interrupt prologue: record the interrupted PC in the
        * CPU's CPC-profile slots - kernel PC if the trap came from
        * supervisor mode, user PC otherwise, zeroing the unused slot -
        * then branch to the common pil15 epilogue.  Only %g registers
        * are used (trap-handler context: reads %tstate/%tpc).
        */
       ENTRY_NP(pil15_interrupt)
       CPU_ADDR(%g1, %g2)                      ! %g1 = CPU struct pointer
       rdpr    %tstate, %g6
       rdpr    %tpc, %g5
       btst    TSTATE_PRIV, %g6                ! trap from supervisor mode?
       bnz,a,pt %xcc, 1f                       ! ,a: delay slot only if taken
       stn     %g5, [%g1 + CPU_CPCPROFILE_PC]  ! if so, record kernel PC
       stn     %g5, [%g1 + CPU_CPCPROFILE_UPC] ! if not, record user PC
       ba      pil15_epilogue                  ! must be large-disp branch
       stn     %g0, [%g1 + CPU_CPCPROFILE_PC]  ! zero kernel PC
1:     ba      pil15_epilogue                  ! must be large-disp branch
       stn     %g0, [%g1 + CPU_CPCPROFILE_UPC] ! zero user PC
       SET_SIZE(pil15_interrupt)
 924 
 925 #ifdef DEBUG
 926         .seg    ".text"
 927 find_cpufreq_panic:
 928         .asciz  "find_cpufrequency: interrupts already disabled on entry"
 929 #endif  /* DEBUG */
 930 
        /*
         * find_cpufrequency(clock_ptr) - measure native ticks per second
         * by watching the byte at *%o0 (the TOD clock's "seconds"
         * register, per the existing comments) roll over twice: once to
         * find a second boundary, once to span exactly one full second.
         * Returns the native-tick delta in %o0.  Runs with interrupts
         * disabled so the measurement isn't skewed; DEBUG kernels panic
         * if interrupts were already disabled on entry (see
         * find_cpufreq_panic above).
         */
        ENTRY_NP(find_cpufrequency)
        rdpr    %pstate, %g1

#ifdef DEBUG
        andcc   %g1, PSTATE_IE, %g0     ! If DEBUG, check that interrupts
        bnz     0f                      ! are currently enabled
        sethi   %hi(find_cpufreq_panic), %o1
        call    panic
        or      %o1, %lo(find_cpufreq_panic), %o0
#endif  /* DEBUG */

0:
        wrpr    %g1, PSTATE_IE, %pstate ! Disable interrupts
3:
        ldub    [%o0], %o1              ! Read the number of seconds
        mov     %o1, %o2                ! remember initial value in %o2
1:
        GET_NATIVE_TIME(%o3, %g4, %g5)  ! %o3 = tick at start of the second
        cmp     %o1, %o2                ! did the seconds register roll over?
        be,pt   %icc, 1b                ! branch back if unchanged
        ldub    [%o0], %o2              !   delay: load the new seconds val

        brz,pn  %o2, 3b                 ! if the minutes just rolled over,
                                        ! the last second could have been
                                        ! inaccurate; try again.
        mov     %o2, %o4                !   delay: save current value in %o4
2:
        GET_NATIVE_TIME(%o5, %g4, %g5)  ! %o5 = tick at end of the second
        cmp     %o2, %o4                ! did the seconds register roll over?
        be,pt   %icc, 2b                ! branch back if unchanged
        ldub    [%o0], %o4              !   delay: load the new seconds val

        brz,pn  %o4, 0b                 ! if the minutes just rolled over,
                                        ! the last second could have been
                                        ! inaccurate; try again.
        wrpr    %g0, %g1, %pstate       !   delay: re-enable interrupts

        retl
        sub     %o5, %o3, %o0           ! return the difference in ticks
        SET_SIZE(find_cpufrequency)
 971 
 972 #if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
 973         defined(SERRANO)
 974         !
 975         ! On US-III, the prefetch instruction queue is 8 entries deep.
 976         ! Also, prefetches for write put data in the E$, which has
 977         ! lines of 512 bytes for an 8MB cache. Each E$ line is further
 978         ! subblocked into 64 byte chunks.
 979         !
 980         ! Since prefetch can only bring in 64 bytes at a time (See Sparc
 981         ! v9 Architecture Manual pp.204) and a page_t is 128 bytes,
 982         ! then 2 prefetches are required in order to bring an entire
 983         ! page into the E$.
 984         !
 985         ! Since the prefetch queue is 8 entries deep, we currently can
 986         ! only have 4 prefetches for page_t's outstanding. Thus, we
 987         ! prefetch n+4 ahead of where we are now: 
 988         !
 989         !      4 * sizeof(page_t)     -> 512
 990         !      4 * sizeof(page_t) +64 -> 576
 991         ! 
 992         ! Example
 993         ! =======
 994         ! contiguous page array in memory...
 995         !
 996         ! |AAA1|AAA2|BBB1|BBB2|CCC1|CCC2|DDD1|DDD2|XXX1|XXX2|YYY1|YYY2|...
 997         ! ^         ^         ^         ^         ^    ^
 998         ! pp                                      |    pp+4*sizeof(page)+64
 999         !                                         |
1000         !                                         pp+4*sizeof(page)
1001         !
1002         !  Prefetch
1003         !   Queue
1004         ! +-------+<--- In this iteration, we're working with pp (AAA1),
1005         ! |Preftch|     but we enqueue prefetch for addr = XXX1
1006         ! | XXX1  | 
1007         ! +-------+<--- this queue slot will be a prefetch instruction for
1008         ! |Preftch|     for addr = pp + 4*sizeof(page_t) + 64 (or second
1009         ! | XXX2  |     half of page XXX)
1010         ! +-------+ 
1011         ! |Preftch|<-+- The next time around this function, we'll be
1012         ! | YYY1  |  |  working with pp = BBB1, but will be enqueueing
1013         ! +-------+  |  prefetches to for both halves of page YYY,
1014         ! |Preftch|  |  while both halves of page XXX are in transit
1015         ! | YYY2  |<-+  make their way into the E$.
1016         ! +-------+
1017         ! |Preftch|
1018         ! | ZZZ1  |
1019         ! +-------+
1020         ! .       .
1021         ! :       :
1022         !
1023         !  E$
1024         ! +============================================...
1025         ! | XXX1 | XXX2 | YYY1 | YYY2 | ZZZ1 | ZZZ2 |
1026         ! +============================================...
1027         ! |      |      |      |      |      |      |
1028         ! +============================================...
1029         ! .
1030         ! :
1031         !
1032         ! So we should expect the first four page accesses to stall
        ! while we warm up the cache, after which most of the pages
1034         ! will have their pp ready in the E$.
1035         ! 
1036         ! Also note that if sizeof(page_t) grows beyond 128, then 
1037         ! we'll need an additional prefetch to get an entire page
1038         ! into the E$, thus reducing the number of outstanding page
1039         ! prefetches to 2 (ie. 3 prefetches/page = 6 queue slots)
1040         ! etc.
1041         !
1042         ! Cheetah+
1043         ! ========
1044         ! On Cheetah+ we use "#n_write" prefetches as these avoid
1045         ! unnecessary RTS->RTO bus transaction state change, and
1046         ! just issues RTO transaction. (See pp.77 of Cheetah+ Delta
1047         ! PRM). On Cheetah, #n_write prefetches are reflected with
1048         ! RTS->RTO state transition regardless.
1049         !
1050 #define STRIDE1 512
1051 #define STRIDE2 576
1052 
1053 #if     STRIDE1 != (PAGE_SIZE * 4)
1054 #error  "STRIDE1 != (PAGE_SIZE * 4)"
1055 #endif  /* STRIDE1 != (PAGE_SIZE * 4) */
1056 
1057 /*
1058  * Prefetch a page_t for write or read, this assumes a linear
1059  * scan of sequential page_t's.
1060  */
        /*
         * prefetch_page_w(pp) - queue prefetches-for-write for both
         * 64-byte halves of the page_t that is 4 page_t's (STRIDE1
         * bytes) ahead of pp; see the pipeline diagram above.
         */
        ENTRY(prefetch_page_w)
        prefetch        [%o0+STRIDE1], #n_writes
        retl
        prefetch        [%o0+STRIDE2], #n_writes        ! delay: second half
        SET_SIZE(prefetch_page_w)
1066 
1067         !
1068         ! Note on CHEETAH to prefetch for read, we really use #one_write.
1069         ! This fetches to E$ (general use) rather than P$ (floating point use).
1070         !
1071         ENTRY(prefetch_page_r)
1072         prefetch        [%o0+STRIDE1], #one_write
1073         retl
1074         prefetch        [%o0+STRIDE2], #one_write
1075         SET_SIZE(prefetch_page_r)
1076 
1077 #elif defined(SPITFIRE) || defined(HUMMINGBIRD)
1078 
1079         !
1080         ! UltraSparcII can have up to 3 prefetches outstanding.
1081         ! A page_t is 128 bytes (2 prefetches of 64 bytes each)
1082         ! So prefetch for pp + 1, which is
1083         !
1084         !       pp + sizeof(page_t)
1085         ! and
1086         !       pp + sizeof(page_t) + 64
1087         !
1088 #define STRIDE1 128
1089 #define STRIDE2 192
1090 
1091 #if     STRIDE1 != PAGE_SIZE
1092 #error  "STRIDE1 != PAGE_SIZE"
1093 #endif  /* STRIDE1 != PAGE_SIZE */
1094 
        /*
         * prefetch_page_w(pp) - UltraSPARC-II: prefetch-for-write both
         * 64-byte halves of the next page_t (pp + STRIDE1).
         */
        ENTRY(prefetch_page_w)
        prefetch        [%o0+STRIDE1], #n_writes
        retl
        prefetch        [%o0+STRIDE2], #n_writes        ! delay: second half
        SET_SIZE(prefetch_page_w)
1100 
        /*
         * prefetch_page_r(pp) - UltraSPARC-II: prefetch-for-read both
         * 64-byte halves of the next page_t (pp + STRIDE1).
         */
        ENTRY(prefetch_page_r)
        prefetch        [%o0+STRIDE1], #n_reads
        retl
        prefetch        [%o0+STRIDE2], #n_reads         ! delay: second half
        SET_SIZE(prefetch_page_r)
1106 
1107 #elif defined(OLYMPUS_C)
1108         !
1109         ! Prefetch strides for Olympus-C
1110         !
1111 
1112 #define STRIDE1 0x440
1113 #define STRIDE2 0x640
1114         
        /*
         * prefetch_page_w(pp) - Olympus-C: prefetch-for-write at the
         * two Olympus-C strides ahead of pp.
         */
        ENTRY(prefetch_page_w)
        prefetch        [%o0+STRIDE1], #n_writes
        retl
        prefetch        [%o0+STRIDE2], #n_writes        ! delay: second stride
        SET_SIZE(prefetch_page_w)
1120 
        /*
         * prefetch_page_r(pp) - Olympus-C: prefetch at the two
         * Olympus-C strides ahead of pp.
         *
         * NOTE(review): this uses #n_writes rather than a read variant;
         * presumably intentional for this CPU (compare the CHEETAH
         * #one_write note above) - confirm against the Olympus-C PRM.
         */
        ENTRY(prefetch_page_r)
        prefetch        [%o0+STRIDE1], #n_writes
        retl
        prefetch        [%o0+STRIDE2], #n_writes        ! delay: second stride
        SET_SIZE(prefetch_page_r)
1126 #else   /* OLYMPUS_C */
1127 
1128 #error "You need to fix this for your new cpu type."
1129 
1130 #endif  /* OLYMPUS_C */
1131 
1132 #if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
1133         defined(SERRANO)
1134 
1135 #define PREFETCH_Q_LEN 8
1136 
1137 #elif defined(SPITFIRE) || defined(HUMMINGBIRD)
1138 
1139 #define PREFETCH_Q_LEN 3
1140 
1141 #elif defined(OLYMPUS_C)
1142         !
1143         ! Use length of one for now.
1144         !
1145 #define PREFETCH_Q_LEN  1
1146 
1147 #else   /* OLYMPUS_C */
1148 
1149 #error You need to fix this for your new cpu type.
1150 
1151 #endif  /* OLYMPUS_C */
1152 
1153 #include <vm/kpm.h>
1154 
1155 #ifdef  SEGKPM_SUPPORT
1156 
1157 #define SMAP_SIZE 72
1158 #define SMAP_STRIDE (((PREFETCH_Q_LEN * 64) / SMAP_SIZE) * 64)
1159 
1160 #else   /* SEGKPM_SUPPORT */
1161 
1162         !
1163         ! The hardware will prefetch the 64 byte cache aligned block
1164         ! that contains the address specified in the prefetch instruction.
1165         ! Since the size of the smap struct is 48 bytes, issuing 1 prefetch
1166         ! per pass will suffice as long as we prefetch far enough ahead to
1167         ! make sure we don't stall for the cases where the smap object
1168         ! spans multiple hardware prefetch blocks.  Let's prefetch as far
1169         ! ahead as the hardware will allow.
1170         !
1171         ! The smap array is processed with decreasing address pointers.
1172         !
1173 #define SMAP_SIZE 48
1174 #define SMAP_STRIDE (PREFETCH_Q_LEN * SMAP_SIZE)
1175 
1176 #endif  /* SEGKPM_SUPPORT */
1177 
1178 /*
1179  * Prefetch struct smap for write.
1180  */
        ENTRY(prefetch_smap_w)
        retl
        ! delay slot: one prefetch-for-write, SMAP_STRIDE bytes *behind*
        ! %o0 - the smap array is walked with decreasing addresses (see
        ! the comment above).
        prefetch        [%o0-SMAP_STRIDE], #n_writes
        SET_SIZE(prefetch_smap_w)
1185 
        /*
         * getidsr() - return the contents of the interrupt dispatch
         * status register (ldxa from ASI_INTR_DISPATCH_STATUS) in %o0.
         */
        ENTRY_NP(getidsr)
        retl
        ldxa    [%g0]ASI_INTR_DISPATCH_STATUS, %o0      ! delay: read IDSR
        SET_SIZE(getidsr)
1190