/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else	/* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
[...]

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)
[...]

	movq	%gs:CPU_THREAD, %r11
	movq	T_STACK(%r11), %rsp

	movq	%rcx, REGOFF_RIP(%rsp)
	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	pushfq
	popq	%r11			/* hypercall enables ints */
	movq	%r11, REGOFF_RFL(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)
	addq	$REGOFF_RIP, %rsp
	/*
	 * XXPV: see comment in SYSRETQ definition for future optimization
	 * we could take.
	 */
	ASSERT_UPCALL_MASK_IS_SET
	SYSRETQ
#else
	ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
	jmp	tr_sysretq
#endif
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_syscall_swapgs_sysretq)

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call
[...]

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)
[...]

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d. The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax		/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx		/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d	/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx	/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp

	ASSERT_UPCALL_MASK_IS_SET
	ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
	jmp	tr_sysretl
	SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
	/*NOTREACHED*/

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi		/* rval1 - %eax */
	movq	%r13, %rdx		/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */

/*
 * [...]
 *
 * The caller in userland has arranged that:
 *
 * - %eax contains the syscall number
 * - %ecx contains the user %esp
 * - %edx contains the return %eip
 * - the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * - %rip is pointing to sys_sysenter (below).
 * - %cs and %ss are set to kernel text and stack (data) selectors.
 * - %rsp is pointing at the lwp's stack.
 * - interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger. For most of the system call mechanisms, the
 * CPU automatically clears the single-step flag before we enter the kernel.
 * The sysenter mechanism does not clear the flag, so a user single-stepping
 * through a libc routine may suddenly find themselves single-stepping through
 * the kernel. To detect this, kmdb and trap() both compare the trap %pc to
 * the [brand_]sys_sysenter addresses on each single-step trap. If either
 * finds that we have single-stepped to a sysenter entry point, it explicitly
 * clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we have
 * two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter. kmdb cannot distinguish between that valid single-step and
 * the undesirable one mentioned above. To avoid this situation, we simply
 * add a jump over the instruction at sys_sysenter to make it impossible to
 * single-step to it.
 */
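
/*
 * Purely as an illustration of the convention described above (this sketch
 * is not part of the original file, and SYS_getpid merely stands in for an
 * arbitrary syscall number), a 32-bit userland sysenter wrapper would look
 * roughly like:
 *
 *	movl	$SYS_getpid, %eax	/ syscall number
 *	movl	%esp, %ecx		/ user %esp, restored by sysexit
 *	movl	$1f, %edx		/ return %eip, used by sysexit
 *	sysenter
 * 1:					/ execution resumes here
 *
 * Only one 32-bit return value comes back in %eax; as noted above, %edx
 * cannot carry rval2 on this path because sysexit consumes it.
 */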
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_brand_sys_sysenter_post_swapgs)

	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_sys_sysenter_post_swapgs)

	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)	/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)	/* wrapper: %eip -> %edx */
	/*
	 * NOTE: none of the instructions that run before we get here should
	 * clobber bits in (R)FLAGS! This includes the kpti trampoline.
	 */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32.
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)	/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch a 64-bit process trying to issue the sysenter instruction
	 * on Nocona-based systems.
	 */
	movq	LWP_PROCP(%r14), %rax

[...]

	/*
	 * [...]
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13		/* upper 32-bits into %edx */
	movl	%eax, %r12d		/* lower 32-bits into %eax */
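
	/*
	 * Illustrative C equivalent of the split above (a sketch only, not
	 * part of the original file; rv, rval1 and rval2 are hypothetical
	 * names):
	 *
	 *	uint64_t rv = handler();		full 64-bit result
	 *	uint32_t rval1 = (uint32_t)rv;		low half, to %eax
	 *	uint32_t rval2 = (uint32_t)(rv >> 32);	high half, to %edx
	 */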

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do. (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled.)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * if we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 *
	 * This cli is undone in the tr_sysexit trampoline code.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit. The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	ALTENTRY(sys_sysenter_swapgs_sysexit)
	jmp	tr_sysexit
	SET_SIZE(sys_sysenter_swapgs_sysexit)
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */

/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls. We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 */
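
/*
 * Illustrative sketch only (not part of the original file): a 32-bit
 * caller on this path arranges its args on the user stack per the i386
 * syscall convention, loads the syscall number, and traps. SYS_xxx stands
 * in for any syscall number; per the sn1_brand_int91 naming below,
 * T_SYSCALLINT is the 0x91 software interrupt vector:
 *
 *	movl	$SYS_xxx, %eax		/ syscall number
 *	int	$T_SYSCALLINT		/ enter the kernel via the int gate
 */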
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */
[...]

	XPV_TRAP_POP
	call	smap_enable

nopop_syscall_int:
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax		/* clear upper 32 bits of syscall # */
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path. It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	CLEAN_CS
	jmp	_syscall32_save
	/*
	 * There should be no instructions between this label and SWAPGS/IRET
	 * or we could end up breaking branded zone support. See the usage of
	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
	 * for examples.
	 *
	 * We want to swapgs to maintain the invariant that all entries into
	 * tr_iret_user are done on the user gsbase.
	 */
	ALTENTRY(sys_sysint_swapgs_iret)
	SWAPGS
	jmp	tr_iret_user
	/*NOTREACHED*/
	SET_SIZE(sys_sysint_swapgs_iret)
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */

/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect. Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 */
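
/*
 * Illustrative sketch only (not part of the original file): the legacy
 * invocation described above is a far call through the LDT's syscall
 * call-gate descriptor, roughly as below. SYSCALL_GATE_SEL is a
 * hypothetical name for that selector, and SYS_xxx stands in for any
 * syscall number; because the descriptor is marked not-present, this
 * faults into trap() rather than reaching a handler here:
 *
 *	movl	$SYS_xxx, %eax		/ syscall number
 *	lcall	$SYSCALL_GATE_SEL, $0	/ #NP fault -> trap() -> syscall
 */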