11787 Kernel needs to be built with retpolines
11788 Kernel needs to generally use RSB stuffing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>
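The changes in this file replace indirect calls of the form "call *reg" or
"call *mem" with a load of the target into a register followed by the
INDIRECT_CALL_REG() macro, so that the dispatch can be routed through a
retpoline thunk (bug 11787) rather than relying on the indirect branch
predictor. As a rough sketch only (the thunk here is named after the common
__x86_indirect_thunk_<reg> convention; the code illumos actually emits may
differ), a retpoline for a target held in %rax looks like:

	__x86_indirect_thunk_rax:
		call	2f		/* push 1f as the return address, jump to 2f */
	1:	pause			/* speculation predicted off the RSB lands */
		lfence			/* here and spins harmlessly */
		jmp	1b
	2:	movq	%rax, (%rsp)	/* overwrite the return address with %rax */
		ret			/* architecturally "return" to the target */

In effect, each old call site such as

	call	*SY_CALLC(%rbx)

becomes

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

as seen throughout the diff below.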
--- old/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ new/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2019 Joyent, Inc.
24 24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/asm_linkage.h>
28 28 #include <sys/asm_misc.h>
29 29 #include <sys/regset.h>
30 30 #include <sys/privregs.h>
31 31 #include <sys/psw.h>
32 32 #include <sys/machbrand.h>
33 33
34 34 #if defined(__lint)
35 35
36 36 #include <sys/types.h>
37 37 #include <sys/thread.h>
38 38 #include <sys/systm.h>
39 39
40 40 #else /* __lint */
41 41
42 42 #include <sys/segments.h>
43 43 #include <sys/pcb.h>
44 44 #include <sys/trap.h>
45 45 #include <sys/ftrace.h>
46 46 #include <sys/traptrace.h>
47 47 #include <sys/clock.h>
48 48 #include <sys/model.h>
49 49 #include <sys/panic.h>
50 50
51 51 #if defined(__xpv)
52 52 #include <sys/hypervisor.h>
53 53 #endif
54 54
55 55 #include "assym.h"
56 56
57 57 #endif /* __lint */
58 58
59 59 /*
60 60 * We implement five flavours of system call entry points
61 61 *
62 62 * - syscall/sysretq (amd64 generic)
63 63 * - syscall/sysretl (i386 plus SYSC bit)
64 64 * - sysenter/sysexit (i386 plus SEP bit)
65 65 * - int/iret (i386 generic)
66 66 * - lcall/iret (i386 generic)
67 67 *
68 68 * The current libc included in Solaris uses int/iret as the base unoptimized
69 69 * kernel entry method. Older libc implementations and legacy binaries may use
70 70 * the lcall call gate, so it must continue to be supported.
71 71 *
72 72 * System calls that use an lcall call gate are processed in trap() via a
73 73 * segment-not-present trap, i.e. lcalls are extremely slow(!).
74 74 *
75 75 * The basic pattern used in the 32-bit SYSC handler at this point in time is
76 76 * to have the bare minimum of assembler, and get to the C handlers as
77 77 * quickly as possible.
78 78 *
79 79 * The 64-bit handler is much closer to the sparcv9 handler; that's
80 80 * because of passing arguments in registers. The 32-bit world still
81 81 * passes arguments on the stack -- that makes that handler substantially
82 82 * more complex.
83 83 *
84 84 * The two handlers share a few code fragments which are broken
85 85 * out into preprocessor macros below.
86 86 *
87 87 * XX64 come back and speed all this up later. The 32-bit stuff looks
88 88 * especially easy to speed up the argument copying part ..
89 89 *
90 90 *
91 91 * Notes about segment register usage (c.f. the 32-bit kernel)
92 92 *
93 93 * In the 32-bit kernel, segment registers are dutifully saved and
94 94 * restored on all mode transitions because the kernel uses them directly.
95 95 * When the processor is running in 64-bit mode, segment registers are
96 96 * largely ignored.
97 97 *
98 98 * %cs and %ss
99 99 * controlled by the hardware mechanisms that make mode transitions
100 100 *
101 101 * The remaining segment registers have to either be pointing at a valid
102 102 * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
103 103 *
104 104 * %ds and %es
105 105 * always ignored
106 106 *
107 107 * %fs and %gs
108 108 * fsbase and gsbase are used to control the place they really point at.
109 109 * The kernel only depends on %gs, and controls its own gsbase via swapgs
110 110 *
111 111 * Note that loading segment registers is still costly because the GDT
112 112 * lookup still happens (this is because the hardware can't know that we're
113 113 * not setting up these segment registers for a 32-bit program). Thus we
114 114 * avoid doing this in the syscall path, and defer them to lwp context switch
115 115 * handlers, so the register values remain virtualized to the lwp.
116 116 */
117 117
118 118 #if defined(SYSCALLTRACE)
119 119 #define ORL_SYSCALLTRACE(r32) \
120 120 orl syscalltrace(%rip), r32
121 121 #else
122 122 #define ORL_SYSCALLTRACE(r32)
123 123 #endif
124 124
125 125 /*
126 126 * In the 32-bit kernel, we do absolutely nothing before getting into the
127 127 * brand callback checks. In 64-bit land, we do swapgs and then come here.
128 128 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
129 129 * are still unused.
130 130 *
131 131 * Check if a brand_mach_ops callback is defined for the specified callback_id
132 132 * type. If so invoke it with the kernel's %gs value loaded and the following
133 133 * data on the stack:
134 134 *
135 135 * stack: --------------------------------------
136 136 * 32 | callback pointer |
137 137 * | 24 | user (or interrupt) stack pointer |
138 138 * | 16 | lwp pointer |
139 139 * v 8 | userland return address |
140 140 * 0 | callback wrapper return addr |
141 141 * --------------------------------------
142 142 *
143 143 * Since we're pushing the userland return address onto the kernel stack
144 144 * we need to get that address without accessing the user's stack (since we
145 145 * can't trust that data). There are different ways to get the userland
146 146 * return address depending on how the syscall trap was made:
147 147 *
148 148 * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
149 149 * b) For sys_sysenter the return address is in %rdx.
150 150 * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
151 151 * the stack pointer points at the state saved when we took the interrupt:
152 152 * ------------------------
153 153 * | | user's %ss |
154 154 * | | user's %esp |
155 155 * | | EFLAGS register |
156 156 * v | user's %cs |
157 157 * | user's %eip |
158 158 * ------------------------
159 159 *
160 160 * The 2nd parameter to the BRAND_CALLBACK macro is either the
161 161 * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro. These macros are
162 162 * used to generate the proper code to get the userland return address for
163 163 * each syscall entry point.
164 164 *
165 165 * The interface to the brand callbacks on the 64-bit kernel assumes %r15
166 166 * is available as a scratch register within the callback. If the callback
167 167 * returns within the kernel then this macro will restore %r15. If the
168 168 * callback is going to return directly to userland then it should restore
169 169 * %r15 before returning to userland.
170 170 */
171 171 #define BRAND_URET_FROM_REG(rip_reg) \
172 172 pushq rip_reg /* push the return address */
173 173
174 174 /*
175 175 * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
176 176 * is currently pointing at the user return address (%eip).
177 177 */
178 178 #define BRAND_URET_FROM_INTR_STACK() \
179 179 movq %gs:CPU_RTMP_RSP, %r15 /* grab the intr. stack pointer */ ;\
180 180 pushq (%r15) /* push the return address */
181 181
182 182 #define BRAND_CALLBACK(callback_id, push_userland_ret) \
183 183 movq %rsp, %gs:CPU_RTMP_RSP /* save the stack pointer */ ;\
184 184 movq %r15, %gs:CPU_RTMP_R15 /* save %r15 */ ;\
185 185 movq %gs:CPU_THREAD, %r15 /* load the thread pointer */ ;\
186 186 movq T_STACK(%r15), %rsp /* switch to the kernel stack */ ;\
187 187 subq $16, %rsp /* save space for 2 pointers */ ;\
188 188 pushq %r14 /* save %r14 */ ;\
189 189 movq %gs:CPU_RTMP_RSP, %r14 ;\
190 190 movq %r14, 8(%rsp) /* stash the user stack pointer */ ;\
191 191 popq %r14 /* restore %r14 */ ;\
192 192 movq T_LWP(%r15), %r15 /* load the lwp pointer */ ;\
193 193 pushq %r15 /* push the lwp pointer */ ;\
194 194 movq LWP_PROCP(%r15), %r15 /* load the proc pointer */ ;\
195 195 movq P_BRAND(%r15), %r15 /* load the brand pointer */ ;\
196 196 movq B_MACHOPS(%r15), %r15 /* load the machops pointer */ ;\
197 197 movq _CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15 ;\
198 198 cmpq $0, %r15 ;\
199 199 je 1f ;\
200 200 movq %r15, 16(%rsp) /* save the callback pointer */ ;\
201 201 push_userland_ret /* push the return address */ ;\
202 - call *24(%rsp) /* call callback */ ;\
202 + movq 24(%rsp), %r15 /* load callback pointer */ ;\
203 + INDIRECT_CALL_REG(r15) /* call callback */ ;\
203 204 1: movq %gs:CPU_RTMP_R15, %r15 /* restore %r15 */ ;\
204 205 movq %gs:CPU_RTMP_RSP, %rsp /* restore the stack pointer */
205 206
206 207 #define MSTATE_TRANSITION(from, to) \
207 208 movl $from, %edi; \
208 209 movl $to, %esi; \
209 210 call syscall_mstate
210 211
211 212 /*
212 213 * Check to see if a simple (direct) return is possible i.e.
213 214 *
214 215 * if (t->t_post_sys_ast | syscalltrace |
215 216 * lwp->lwp_pcb.pcb_rupdate == 1)
216 217 * do full version ;
217 218 *
218 219 * Preconditions:
219 220 * - t is curthread
220 221 * Postconditions:
221 222 * - condition code NE is set if post-sys is too complex
222 223 * - rtmp is zeroed if it isn't (we rely on this!)
223 224 * - ltmp is smashed
224 225 */
225 226 #define CHECK_POSTSYS_NE(t, ltmp, rtmp) \
226 227 movq T_LWP(t), ltmp; \
227 228 movzbl PCB_RUPDATE(ltmp), rtmp; \
228 229 ORL_SYSCALLTRACE(rtmp); \
229 230 orl T_POST_SYS_AST(t), rtmp; \
230 231 cmpl $0, rtmp
231 232
232 233 /*
233 234 * Fix up the lwp, thread, and eflags for a successful return
234 235 *
235 236 * Preconditions:
236 237 * - zwreg contains zero
237 238 */
238 239 #define SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg) \
239 240 movb $LWP_USER, LWP_STATE(lwp); \
240 241 movw zwreg, T_SYSNUM(t); \
241 242 andb $_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
242 243
243 244 /*
244 245 * ASSERT(lwptoregs(lwp) == rp);
245 246 *
246 247 * This may seem obvious, but very odd things happen if this
247 248 * assertion is false
248 249 *
249 250 * Preconditions:
250 251 * (%rsp is ready for normal call sequence)
251 252 * Postconditions (if assertion is true):
252 253 * %r11 is smashed
253 254 *
254 255 * ASSERT(rp->r_cs == descnum)
255 256 *
256 257 * The code selector is written into the regs structure when the
257 258 * lwp stack is created. We use this ASSERT to validate that
258 259 * the regs structure really matches how we came in.
259 260 *
260 261 * Preconditions:
261 262 * (%rsp is ready for normal call sequence)
262 263 * Postconditions (if assertion is true):
263 264 * -none-
264 265 *
265 266 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
266 267 *
267 268 * If this is false, it means that we returned to userland without
268 269 * updating the segment registers as we were supposed to.
269 270 *
270 271 * Note that we must ensure no interrupts or other traps intervene
271 272 * between entering privileged mode and performing the assertion,
272 273 * otherwise we may perform a context switch on the thread, which
273 274 * will end up setting pcb_rupdate to 1 again.
274 275 *
275 276 * ASSERT(%cr0 & CR0_TS == 0);
276 277 * Preconditions:
277 278 * (%rsp is ready for normal call sequence)
278 279 * Postconditions (if assertion is true):
279 280 * (specified register is clobbered)
280 281 *
281 282 * Check to make sure that we are returning to user land and that CR0.TS
282 283 * is not set. This is required as part of the eager FPU (see
283 284 * uts/intel/ia32/os/fpu.c for more information).
284 285 */
285 286
286 287 #if defined(DEBUG)
287 288
288 289 #if !defined(__lint)
289 290
290 291 __lwptoregs_msg:
291 292 .string "syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
292 293
293 294 __codesel_msg:
294 295 .string "syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
295 296
296 297 __no_rupdate_msg:
297 298 .string "syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
298 299
299 300 __bad_ts_msg:
300 301 .string "syscall_asm_amd64.s:%d CR0.TS set on user return"
301 302
302 303 #endif /* !__lint */
303 304
304 305 #define ASSERT_LWPTOREGS(lwp, rp) \
305 306 movq LWP_REGS(lwp), %r11; \
306 307 cmpq rp, %r11; \
307 308 je 7f; \
308 309 leaq __lwptoregs_msg(%rip), %rdi; \
309 310 movl $__LINE__, %esi; \
310 311 movq lwp, %rdx; \
311 312 movq %r11, %rcx; \
312 313 movq rp, %r8; \
313 314 xorl %eax, %eax; \
314 315 call panic; \
315 316 7:
316 317
317 318 #define ASSERT_NO_RUPDATE_PENDING(lwp) \
318 319 testb $0x1, PCB_RUPDATE(lwp); \
319 320 je 8f; \
320 321 movq lwp, %rdx; \
321 322 leaq __no_rupdate_msg(%rip), %rdi; \
322 323 movl $__LINE__, %esi; \
323 324 xorl %eax, %eax; \
324 325 call panic; \
325 326 8:
326 327
327 328 #define ASSERT_CR0TS_ZERO(reg) \
328 329 movq %cr0, reg; \
329 330 testq $CR0_TS, reg; \
330 331 jz 9f; \
331 332 leaq __bad_ts_msg(%rip), %rdi; \
332 333 movl $__LINE__, %esi; \
333 334 xorl %eax, %eax; \
334 335 call panic; \
335 336 9:
336 337
337 338 #else
338 339 #define ASSERT_LWPTOREGS(lwp, rp)
339 340 #define ASSERT_NO_RUPDATE_PENDING(lwp)
340 341 #define ASSERT_CR0TS_ZERO(reg)
341 342 #endif
342 343
343 344 /*
344 345 * Do the traptrace thing and restore any registers we used
345 346 * in situ. Assumes that %rsp is pointing at the base of
346 347 * the struct regs, obviously ..
347 348 */
348 349 #ifdef TRAPTRACE
349 350 #define SYSCALL_TRAPTRACE(ttype) \
350 351 TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype); \
351 352 TRACE_REGS(%rdi, %rsp, %rbx, %rcx); \
352 353 TRACE_STAMP(%rdi); /* rdtsc clobbers %eax, %edx */ \
353 354 movq REGOFF_RAX(%rsp), %rax; \
354 355 movq REGOFF_RBX(%rsp), %rbx; \
355 356 movq REGOFF_RCX(%rsp), %rcx; \
356 357 movq REGOFF_RDX(%rsp), %rdx; \
357 358 movl %eax, TTR_SYSNUM(%rdi); \
358 359 movq REGOFF_RDI(%rsp), %rdi
359 360
360 361 #define SYSCALL_TRAPTRACE32(ttype) \
361 362 SYSCALL_TRAPTRACE(ttype); \
362 363 /* paranoia: clean the top 32-bits of the registers */ \
363 364 orl %eax, %eax; \
364 365 orl %ebx, %ebx; \
365 366 orl %ecx, %ecx; \
366 367 orl %edx, %edx; \
367 368 orl %edi, %edi
368 369 #else /* TRAPTRACE */
369 370 #define SYSCALL_TRAPTRACE(ttype)
370 371 #define SYSCALL_TRAPTRACE32(ttype)
371 372 #endif /* TRAPTRACE */
372 373
373 374 /*
374 375 * The 64-bit libc syscall wrapper does this:
375 376 *
376 377 * fn(<args>)
377 378 * {
378 379 * movq %rcx, %r10 -- because syscall smashes %rcx
379 380 * movl $CODE, %eax
380 381 * syscall
381 382 * <error processing>
382 383 * }
383 384 *
384 385 * Thus when we come into the kernel:
385 386 *
386 387 * %rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
387 388 * %rax is the syscall number
388 389 * %r12-%r15 contain caller state
389 390 *
390 391 * The syscall instruction arranges that:
391 392 *
392 393 * %rcx contains the return %rip
393 394 * %r11d contains bottom 32-bits of %rflags
394 395 * %rflags is masked (as determined by the SFMASK msr)
395 396 * %cs is set to UCS_SEL (as determined by the STAR msr)
396 397 * %ss is set to UDS_SEL (as determined by the STAR msr)
397 398 * %rip is set to sys_syscall (as determined by the LSTAR msr)
398 399 *
399 400 * Or in other words, we have no registers available at all.
400 401 * Only swapgs can save us!
401 402 *
402 403 * Under the hypervisor, the swapgs has happened already. However, the
403 404 * state of the world is very different from that which we're familiar with.
404 405 *
405 406 * In particular, we have a stack structure like that for interrupt
406 407 * gates, except that the %cs and %ss registers are modified for reasons
407 408 * that are not entirely clear. Critically, the %rcx/%r11 values do
408 409 * *not* reflect the usage of those registers under a 'real' syscall[1];
409 410 * the stack, therefore, looks like this:
410 411 *
411 412 * 0x0(rsp) potentially junk %rcx
412 413 * 0x8(rsp) potentially junk %r11
413 414 * 0x10(rsp) user %rip
414 415 * 0x18(rsp) modified %cs
415 416 * 0x20(rsp) user %rflags
416 417 * 0x28(rsp) user %rsp
417 418 * 0x30(rsp) modified %ss
418 419 *
419 420 *
420 421 * and before continuing on, we must load the %rip into %rcx and the
421 422 * %rflags into %r11.
422 423 *
423 424 * [1] They used to, and we relied on it, but this was broken in 3.1.1.
424 425 * Sigh.
425 426 */
426 427 #if defined(__xpv)
427 428 #define XPV_SYSCALL_PROD \
428 429 movq 0x10(%rsp), %rcx; \
429 430 movq 0x20(%rsp), %r11; \
430 431 movq 0x28(%rsp), %rsp
431 432 #else
432 433 #define XPV_SYSCALL_PROD /* nothing */
433 434 #endif
434 435
435 436 #if defined(__lint)
436 437
437 438 /*ARGSUSED*/
438 439 void
439 440 sys_syscall()
440 441 {}
441 442
442 443 void
443 444 _allsyscalls()
444 445 {}
445 446
446 447 size_t _allsyscalls_size;
447 448
448 449 #else /* __lint */
449 450
450 451 ENTRY_NP2(brand_sys_syscall,_allsyscalls)
451 452 SWAPGS /* kernel gsbase */
452 453 XPV_SYSCALL_PROD
453 454 BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
454 455 jmp noprod_sys_syscall
455 456
456 457 ALTENTRY(sys_syscall)
457 458 SWAPGS /* kernel gsbase */
458 459 XPV_SYSCALL_PROD
459 460
460 461 noprod_sys_syscall:
461 462 movq %r15, %gs:CPU_RTMP_R15
462 463 movq %rsp, %gs:CPU_RTMP_RSP
463 464
464 465 movq %gs:CPU_THREAD, %r15
465 466 movq T_STACK(%r15), %rsp /* switch from user to kernel stack */
466 467
467 468 ASSERT_UPCALL_MASK_IS_SET
468 469
469 470 movl $UCS_SEL, REGOFF_CS(%rsp)
470 471 movq %rcx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */
471 472 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */
472 473 movl $UDS_SEL, REGOFF_SS(%rsp)
473 474
474 475 movl %eax, %eax /* wrapper: sysc# -> %eax */
475 476 movq %rdi, REGOFF_RDI(%rsp)
476 477 movq %rsi, REGOFF_RSI(%rsp)
477 478 movq %rdx, REGOFF_RDX(%rsp)
478 479 movq %r10, REGOFF_RCX(%rsp) /* wrapper: %rcx -> %r10 */
479 480 movq %r10, %rcx /* arg[3] for direct calls */
480 481
481 482 movq %r8, REGOFF_R8(%rsp)
482 483 movq %r9, REGOFF_R9(%rsp)
483 484 movq %rax, REGOFF_RAX(%rsp)
484 485 movq %rbx, REGOFF_RBX(%rsp)
485 486
486 487 movq %rbp, REGOFF_RBP(%rsp)
487 488 movq %r10, REGOFF_R10(%rsp)
488 489 movq %gs:CPU_RTMP_RSP, %r11
489 490 movq %r11, REGOFF_RSP(%rsp)
490 491 movq %r12, REGOFF_R12(%rsp)
491 492
492 493 movq %r13, REGOFF_R13(%rsp)
493 494 movq %r14, REGOFF_R14(%rsp)
494 495 movq %gs:CPU_RTMP_R15, %r10
495 496 movq %r10, REGOFF_R15(%rsp)
496 497 movq $0, REGOFF_SAVFP(%rsp)
497 498 movq $0, REGOFF_SAVPC(%rsp)
498 499
499 500 /*
500 501 * Copy these registers here in case we end up stopped with
501 502 * someone (like, say, /proc) messing with our register state.
502 503 * We don't -restore- them unless we have to in update_sregs.
503 504 *
504 505 * Since userland -can't- change fsbase or gsbase directly,
505 506 * and capturing them involves two serializing instructions,
506 507 * we don't bother to capture them here.
507 508 */
508 509 xorl %ebx, %ebx
509 510 movw %ds, %bx
510 511 movq %rbx, REGOFF_DS(%rsp)
511 512 movw %es, %bx
512 513 movq %rbx, REGOFF_ES(%rsp)
513 514 movw %fs, %bx
514 515 movq %rbx, REGOFF_FS(%rsp)
515 516 movw %gs, %bx
516 517 movq %rbx, REGOFF_GS(%rsp)
517 518
518 519 /*
519 520 * If we're trying to use TRAPTRACE though, I take that back: we're
520 521 * probably debugging some problem in the SWAPGS logic and want to know
521 522 * what the incoming gsbase was.
522 523 *
523 524 * Since we already did SWAPGS, record the KGSBASE.
524 525 */
525 526 #if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
526 527 movl $MSR_AMD_KGSBASE, %ecx
527 528 rdmsr
528 529 movl %eax, REGOFF_GSBASE(%rsp)
529 530 movl %edx, REGOFF_GSBASE+4(%rsp)
530 531 #endif
531 532
532 533 /*
533 534 * Machine state saved in the regs structure on the stack
534 535 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
535 536 * %eax is the syscall number
536 537 * %rsp is the thread's stack, %r15 is curthread
537 538 * REG_RSP(%rsp) is the user's stack
538 539 */
539 540
540 541 SYSCALL_TRAPTRACE($TT_SYSC64)
541 542
542 543 movq %rsp, %rbp
543 544
544 545 movq T_LWP(%r15), %r14
545 546 ASSERT_NO_RUPDATE_PENDING(%r14)
546 547 ENABLE_INTR_FLAGS
547 548
548 549 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
549 550 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
550 551
551 552 ASSERT_LWPTOREGS(%r14, %rsp)
552 553
553 554 movb $LWP_SYS, LWP_STATE(%r14)
554 555 incq LWP_RU_SYSC(%r14)
555 556 movb $NORMALRETURN, LWP_EOSYS(%r14)
556 557
557 558 incq %gs:CPU_STATS_SYS_SYSCALL
558 559
559 560 movw %ax, T_SYSNUM(%r15)
560 561 movzbl T_PRE_SYS(%r15), %ebx
561 562 ORL_SYSCALLTRACE(%ebx)
562 563 testl %ebx, %ebx
563 564 jne _syscall_pre
564 565
565 566 _syscall_invoke:
566 567 movq REGOFF_RDI(%rbp), %rdi
567 568 movq REGOFF_RSI(%rbp), %rsi
568 569 movq REGOFF_RDX(%rbp), %rdx
569 570 movq REGOFF_RCX(%rbp), %rcx
570 571 movq REGOFF_R8(%rbp), %r8
571 572 movq REGOFF_R9(%rbp), %r9
572 573
573 574 cmpl $NSYSCALL, %eax
574 575 jae _syscall_ill
575 576 shll $SYSENT_SIZE_SHIFT, %eax
576 577 leaq sysent(%rax), %rbx
577 578
578 - call *SY_CALLC(%rbx)
579 + movq SY_CALLC(%rbx), %rax
580 + INDIRECT_CALL_REG(rax)
579 581
580 582 movq %rax, %r12
581 583 movq %rdx, %r13
582 584
583 585 /*
584 586 * If the handler returns two ints, then we need to split the
585 587 * 64-bit return value into two 32-bit values.
586 588 */
587 589 testw $SE_32RVAL2, SY_FLAGS(%rbx)
588 590 je 5f
589 591 movq %r12, %r13
590 592 shrq $32, %r13 /* upper 32-bits into %edx */
591 593 movl %r12d, %r12d /* lower 32-bits into %eax */
592 594 5:
593 595 /*
594 596 * Optimistically assume that there's no post-syscall
595 597 * work to do. (This is to avoid having to call syscall_mstate()
596 598 * with interrupts disabled)
597 599 */
598 600 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
599 601
600 602 /*
601 603 * We must protect ourselves from being descheduled here;
602 604 * If we were, and we ended up on another cpu, or another
603 605 * lwp got in ahead of us, it could change the segment
604 606 * registers without us noticing before we return to userland.
605 607 */
606 608 CLI(%r14)
607 609 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
608 610 jne _syscall_post
609 611
610 612 /*
611 613 * We need to protect ourselves against non-canonical return values
612 614 * because Intel doesn't check for them on sysret (AMD does). Canonical
613 615 * addresses on current amd64 processors only use 48-bits for VAs; an
614 616 * address is canonical if all upper bits (47-63) are identical. If we
615 617 * find a non-canonical %rip, we opt to go through the full
616 618 * _syscall_post path which takes us into an iretq which is not
617 619 * susceptible to the same problems sysret is.
618 620 *
619 621 * We're checking for a canonical address by first doing an arithmetic
620 622 * shift. This will fill in the remaining bits with the value of bit 63.
621 623 * If the address were canonical, the register would now have either all
622 624 * zeroes or all ones in it. Therefore we add one (inducing overflow)
623 625 * and compare against 1. A canonical address will either be zero or one
624 626 * at this point, hence the use of ja.
625 627 *
626 628 * At this point, r12 and r13 have the return value so we can't use
627 629 * those registers.
628 630 */
629 631 movq REGOFF_RIP(%rsp), %rcx
630 632 sarq $47, %rcx
631 633 incq %rcx
632 634 cmpq $1, %rcx
633 635 ja _syscall_post
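	/*
	 * For illustration, a few example values of the return %rip (chosen
	 * here to show the check at work, not taken from the source):
	 *
	 *	0x00007fffeb44c6d8  sarq -> 0	incq -> 1  canonical, ja not taken
	 *	0xffff80000000f000  sarq -> -1	incq -> 0  canonical, ja not taken
	 *	0x0000800000000000  sarq -> 1	incq -> 2  non-canonical, ja taken
	 */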
634 636
635 637
636 638 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
637 639
638 640 movq %r12, REGOFF_RAX(%rsp)
639 641 movq %r13, REGOFF_RDX(%rsp)
640 642
641 643 /*
642 644 * Clobber %r11 as we check CR0.TS.
643 645 */
644 646 ASSERT_CR0TS_ZERO(%r11)
645 647
646 648 /*
647 649 * Unlike other cases, because we need to restore the user stack pointer
648 650 * before exiting the kernel we must clear the microarch state before
649 651 * getting here. This should be safe because it means that the only
650 652 * values on the bus after this are based on the user's registers and
651 653 * potentially the addresses where we stored them. Given the constraints
652 654 * of sysret, that's how it has to be.
653 655 */
654 - call *x86_md_clear
656 + call x86_md_clear
655 657
656 658 /*
657 659 * To get back to userland, we need the return %rip in %rcx and
658 660 * the return %rfl in %r11d. The sysretq instruction also arranges
659 661 * to fix up %cs and %ss; everything else is our responsibility.
660 662 */
661 663 movq REGOFF_RDI(%rsp), %rdi
662 664 movq REGOFF_RSI(%rsp), %rsi
663 665 movq REGOFF_RDX(%rsp), %rdx
664 666 /* %rcx used to restore %rip value */
665 667
666 668 movq REGOFF_R8(%rsp), %r8
667 669 movq REGOFF_R9(%rsp), %r9
668 670 movq REGOFF_RAX(%rsp), %rax
669 671 movq REGOFF_RBX(%rsp), %rbx
670 672
671 673 movq REGOFF_RBP(%rsp), %rbp
672 674 movq REGOFF_R10(%rsp), %r10
673 675 /* %r11 used to restore %rfl value */
674 676 movq REGOFF_R12(%rsp), %r12
675 677
676 678 movq REGOFF_R13(%rsp), %r13
677 679 movq REGOFF_R14(%rsp), %r14
678 680 movq REGOFF_R15(%rsp), %r15
679 681
680 682 movq REGOFF_RIP(%rsp), %rcx
681 683 movl REGOFF_RFL(%rsp), %r11d
682 684
683 685 #if defined(__xpv)
684 686 addq $REGOFF_RIP, %rsp
685 687 #else
686 688 movq REGOFF_RSP(%rsp), %rsp
687 689 #endif
688 690
689 691 /*
690 692 * There can be no instructions between the ALTENTRY below and
691 693 * SYSRET or we could end up breaking brand support. See label usage
692 694 * in sn1_brand_syscall_callback for an example.
693 695 */
694 696 ASSERT_UPCALL_MASK_IS_SET
695 697 #if defined(__xpv)
696 698 SYSRETQ
697 699 ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
698 700
699 701 /*
700 702 * We can only get here after executing a brand syscall
701 703 * interposition callback handler and simply need to
702 704 * "sysretq" back to userland. On the hypervisor this
703 705 * involves the iret hypercall which requires us to construct
704 706 * just enough of the stack needed for the hypercall.
705 707 * (rip, cs, rflags, rsp, ss).
706 708 */
707 709 movq %rsp, %gs:CPU_RTMP_RSP /* save user's rsp */
708 710 movq %gs:CPU_THREAD, %r11
709 711 movq T_STACK(%r11), %rsp
710 712
711 713 movq %rcx, REGOFF_RIP(%rsp)
712 714 movl $UCS_SEL, REGOFF_CS(%rsp)
713 715 movq %gs:CPU_RTMP_RSP, %r11
714 716 movq %r11, REGOFF_RSP(%rsp)
715 717 pushfq
716 718 popq %r11 /* hypercall enables ints */
717 719 movq %r11, REGOFF_RFL(%rsp)
718 720 movl $UDS_SEL, REGOFF_SS(%rsp)
719 721 addq $REGOFF_RIP, %rsp
720 722 /*
721 723 * XXPV: see comment in SYSRETQ definition for future optimization
722 724 * we could take.
723 725 */
724 726 ASSERT_UPCALL_MASK_IS_SET
725 727 SYSRETQ
726 728 #else
727 729 ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
728 730 jmp tr_sysretq
729 731 #endif
730 732 /*NOTREACHED*/
731 733 SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
732 734
733 735 _syscall_pre:
734 736 call pre_syscall
735 737 movl %eax, %r12d
736 738 testl %eax, %eax
737 739 jne _syscall_post_call
738 740 /*
739 741 * Didn't abort, so reload the syscall args and invoke the handler.
740 742 */
741 743 movzwl T_SYSNUM(%r15), %eax
742 744 jmp _syscall_invoke
743 745
744 746 _syscall_ill:
745 747 call nosys
746 748 movq %rax, %r12
747 749 movq %rdx, %r13
748 750 jmp _syscall_post_call
749 751
750 752 _syscall_post:
751 753 STI
752 754 /*
753 755 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
754 756 * so that we can account for the extra work it takes us to finish.
755 757 */
756 758 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
757 759 _syscall_post_call:
758 760 movq %r12, %rdi
759 761 movq %r13, %rsi
760 762 call post_syscall
761 763 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
762 764 jmp _sys_rtt
763 765 SET_SIZE(sys_syscall)
764 766 SET_SIZE(brand_sys_syscall)
765 767
766 768 #endif /* __lint */
767 769
768 770 #if defined(__lint)
769 771
770 772 /*ARGSUSED*/
771 773 void
772 774 sys_syscall32()
773 775 {}
774 776
775 777 #else /* __lint */
776 778
777 779 ENTRY_NP(brand_sys_syscall32)
778 780 SWAPGS /* kernel gsbase */
779 781 XPV_TRAP_POP
780 782 BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
781 783 jmp nopop_sys_syscall32
782 784
783 785 ALTENTRY(sys_syscall32)
784 786 SWAPGS /* kernel gsbase */
785 787 XPV_TRAP_POP
786 788
787 789 nopop_sys_syscall32:
788 790 movl %esp, %r10d
789 791 movq %gs:CPU_THREAD, %r15
790 792 movq T_STACK(%r15), %rsp
791 793 movl %eax, %eax
792 794
793 795 movl $U32CS_SEL, REGOFF_CS(%rsp)
794 796 movl %ecx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */
795 797 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */
796 798 movq %r10, REGOFF_RSP(%rsp)
797 799 movl $UDS_SEL, REGOFF_SS(%rsp)
798 800
799 801 _syscall32_save:
800 802 movl %edi, REGOFF_RDI(%rsp)
801 803 movl %esi, REGOFF_RSI(%rsp)
802 804 movl %ebp, REGOFF_RBP(%rsp)
803 805 movl %ebx, REGOFF_RBX(%rsp)
804 806 movl %edx, REGOFF_RDX(%rsp)
805 807 movl %ecx, REGOFF_RCX(%rsp)
806 808 movl %eax, REGOFF_RAX(%rsp) /* wrapper: sysc# -> %eax */
807 809 movq $0, REGOFF_SAVFP(%rsp)
808 810 movq $0, REGOFF_SAVPC(%rsp)
809 811
810 812 /*
811 813 * Copy these registers here in case we end up stopped with
812 814 * someone (like, say, /proc) messing with our register state.
813 815 * We don't -restore- them unless we have to in update_sregs.
814 816 *
815 817 * Since userland -can't- change fsbase or gsbase directly,
816 818 * we don't bother to capture them here.
817 819 */
818 820 xorl %ebx, %ebx
819 821 movw %ds, %bx
820 822 movq %rbx, REGOFF_DS(%rsp)
821 823 movw %es, %bx
822 824 movq %rbx, REGOFF_ES(%rsp)
823 825 movw %fs, %bx
824 826 movq %rbx, REGOFF_FS(%rsp)
825 827 movw %gs, %bx
826 828 movq %rbx, REGOFF_GS(%rsp)
827 829
828 830 /*
829 831 * If we're trying to use TRAPTRACE though, I take that back: we're
830 832 * probably debugging some problem in the SWAPGS logic and want to know
831 833 * what the incoming gsbase was.
832 834 *
833 835 * Since we already did SWAPGS, record the KGSBASE.
834 836 */
835 837 #if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
836 838 movl $MSR_AMD_KGSBASE, %ecx
837 839 rdmsr
838 840 movl %eax, REGOFF_GSBASE(%rsp)
839 841 movl %edx, REGOFF_GSBASE+4(%rsp)
840 842 #endif
841 843
842 844 /*
843 845 * Application state saved in the regs structure on the stack
844 846 * %eax is the syscall number
845 847 * %rsp is the thread's stack, %r15 is curthread
846 848 * REG_RSP(%rsp) is the user's stack
847 849 */
848 850
849 851 SYSCALL_TRAPTRACE32($TT_SYSC)
850 852
851 853 movq %rsp, %rbp
852 854
853 855 movq T_LWP(%r15), %r14
854 856 ASSERT_NO_RUPDATE_PENDING(%r14)
855 857
856 858 ENABLE_INTR_FLAGS
857 859
858 860 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
859 861 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
860 862
861 863 ASSERT_LWPTOREGS(%r14, %rsp)
862 864
863 865 incq %gs:CPU_STATS_SYS_SYSCALL
864 866
865 867 /*
866 868 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
867 869 * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
868 870 * more succinctly:
869 871 *
870 872 * SA(MAXSYSARGS * sizeof (long)) == 64
871 873 */
872 874 #define SYS_DROP 64 /* drop for args */
873 875 subq $SYS_DROP, %rsp
874 876 movb $LWP_SYS, LWP_STATE(%r14)
875 877 movq %r15, %rdi
876 878 movq %rsp, %rsi
877 879 call syscall_entry
878 880
879 881 /*
880 882 * Fetch the arguments copied onto the kernel stack and put
881 883 * them in the right registers to invoke a C-style syscall handler.
882 884 * %rax contains the handler address.
883 885 *
884 886 * Ideas for making all this go faster of course include simply
885 887 * forcibly fetching 6 arguments from the user stack under lofault
886 888 * protection, reverting to copyin_args only when watchpoints
887 889 * are in effect.
888 890 *
889 891 * (If we do this, make sure that exec and libthread leave
890 892 * enough space at the top of the stack to ensure that we'll
891 893 * never do a fetch from an invalid page.)
892 894 *
893 895 * Lots of ideas here, but they won't really help with bringup B-)
894 896 * Correctness can't wait, performance can wait a little longer ..
895 897 */
896 898
897 899 movq %rax, %rbx
898 900 movl 0(%rsp), %edi
899 901 movl 8(%rsp), %esi
900 902 movl 0x10(%rsp), %edx
901 903 movl 0x18(%rsp), %ecx
902 904 movl 0x20(%rsp), %r8d
903 905 movl 0x28(%rsp), %r9d
904 906
905 - call *SY_CALLC(%rbx)
907 + movq SY_CALLC(%rbx), %rax
908 + INDIRECT_CALL_REG(rax)
906 909
907 910 movq %rbp, %rsp /* pop the args */
908 911
909 912 /*
910 913 * amd64 syscall handlers -always- return a 64-bit value in %rax.
911 914 * On the 32-bit kernel, they always return that value in %eax:%edx
912 915 * as required by the 32-bit ABI.
913 916 *
914 917 * Simulate the same behaviour by unconditionally splitting the
915 918 * return value in the same way.
916 919 */
917 920 movq %rax, %r13
918 921 shrq $32, %r13 /* upper 32-bits into %edx */
919 922 movl %eax, %r12d /* lower 32-bits into %eax */
920 923
921 924 /*
922 925 * Optimistically assume that there's no post-syscall
923 926 * work to do. (This is to avoid having to call syscall_mstate()
924 927 * with interrupts disabled)
925 928 */
926 929 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
927 930
928 931 /*
929 932 * We must protect ourselves from being descheduled here;
930 933 * If we were, and we ended up on another cpu, or another
931 934 * lwp got in ahead of us, it could change the segment
932 935 * registers without us noticing before we return to userland.
933 936 */
934 937 CLI(%r14)
935 938 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
936 939 jne _full_syscall_postsys32
937 940 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
938 941
939 942 /*
940 943 * Clobber %r11 as we check CR0.TS.
941 944 */
942 945 ASSERT_CR0TS_ZERO(%r11)
943 946
944 947 /*
945 948 * Unlike other cases, because we need to restore the user stack pointer
946 949 * before exiting the kernel we must clear the microarch state before
947 950 * getting here. This should be safe because it means that the only
948 951 * values on the bus after this are based on the user's registers and
949 952 * potentially the addresses where we stored them. Given the constraints
950 953 * of sysret, that's how it has to be.
951 954 */
952 - call *x86_md_clear
955 + call x86_md_clear
953 956
954 957 /*
955 958 * To get back to userland, we need to put the return %rip in %rcx and
956 959 * the return %rfl in %r11d. The sysret instruction also arranges
957 960 * to fix up %cs and %ss; everything else is our responsibility.
958 961 */
959 962
960 963 movl %r12d, %eax /* %eax: rval1 */
961 964 movl REGOFF_RBX(%rsp), %ebx
962 965 /* %ecx used for return pointer */
963 966 movl %r13d, %edx /* %edx: rval2 */
964 967 movl REGOFF_RBP(%rsp), %ebp
965 968 movl REGOFF_RSI(%rsp), %esi
966 969 movl REGOFF_RDI(%rsp), %edi
967 970
968 971 movl REGOFF_RFL(%rsp), %r11d /* %r11 -> eflags */
969 972 movl REGOFF_RIP(%rsp), %ecx /* %ecx -> %eip */
970 973 movl REGOFF_RSP(%rsp), %esp
971 974
972 975 ASSERT_UPCALL_MASK_IS_SET
973 976 ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
974 977 jmp tr_sysretl
975 978 SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
976 979 /*NOTREACHED*/
977 980
978 981 _full_syscall_postsys32:
979 982 STI
980 983 /*
981 984 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
982 985 * so that we can account for the extra work it takes us to finish.
983 986 */
984 987 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
985 988 movq %r15, %rdi
986 989 movq %r12, %rsi /* rval1 - %eax */
987 990 movq %r13, %rdx /* rval2 - %edx */
988 991 call syscall_exit
989 992 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
990 993 jmp _sys_rtt
991 994 SET_SIZE(sys_syscall32)
992 995 SET_SIZE(brand_sys_syscall32)
993 996
994 997 #endif /* __lint */
995 998
996 999 /*
997 1000 * System call handler via the sysenter instruction
998 1001 * Used only for 32-bit system calls on the 64-bit kernel.
999 1002 *
1000 1003 * The caller in userland has arranged that:
1001 1004 *
1002 1005 * - %eax contains the syscall number
1003 1006 * - %ecx contains the user %esp
1004 1007 * - %edx contains the return %eip
1005 1008 * - the user stack contains the args to the syscall
1006 1009 *
1007 1010 * Hardware and (privileged) initialization code have arranged that by
1008 1011 * the time the sysenter instruction completes:
1009 1012 *
1010 1013 * - %rip is pointing to sys_sysenter (below).
1011 1014 * - %cs and %ss are set to kernel text and stack (data) selectors.
1012 1015 * - %rsp is pointing at the lwp's stack
1013 1016 * - interrupts have been disabled.
1014 1017 *
1015 1018 * Note that we are unable to return both "rvals" to userland with
1016 1019 * this call, as %edx is used by the sysexit instruction.
1017 1020 *
1018 1021 * One final complication in this routine is its interaction with
1019 1022 * single-stepping in a debugger. For most of the system call mechanisms, the
1020 1023 * CPU automatically clears the single-step flag before we enter the kernel.
1021 1024 * The sysenter mechanism does not clear the flag, so a user single-stepping
1022 1025 * through a libc routine may suddenly find themselves single-stepping through the
1023 1026 * kernel. To detect this, kmdb and trap() both compare the trap %pc to the
1024 1027 * [brand_]sys_sysenter addresses on each single-step trap. If it finds that we
1025 1028 * have single-stepped to a sysenter entry point, it explicitly clears the flag
1026 1029 * and executes the sys_sysenter routine.
1027 1030 *
1028 1031 * One final complication in this final complication is the fact that we have
1029 1032 * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
1030 1033 * If we enter at brand_sys_sysenter and start single-stepping through the
1031 1034 * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
1032 1035 * kmdb cannot distinguish between that valid single-step and the undesirable
1033 1036 * one mentioned above. To avoid this situation, we simply add a jump over the
1034 1037 * instruction at sys_sysenter to make it impossible to single-step to it.
1035 1038 */
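	/*
	 * For illustration, a hypothetical 32-bit caller that satisfies the
	 * contract above (a sketch only, not the actual libc wrapper):
	 *
	 *	movl	$SYS_getpid, %eax	/ syscall number
	 *	movl	%esp, %ecx		/ user %esp; sysexit restores it
	 *	movl	$1f, %edx		/ return %eip; sysexit jumps here
	 *	sysenter
	 * 1:	/ rval1 is in %eax; %edx was clobbered by the return path
	 */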
1036 1039 #if defined(__lint)
1037 1040
1038 1041 void
1039 1042 sys_sysenter()
1040 1043 {}
1041 1044
1042 1045 #else /* __lint */
1043 1046
1044 1047 ENTRY_NP(brand_sys_sysenter)
1045 1048 SWAPGS /* kernel gsbase */
1046 1049 ALTENTRY(_brand_sys_sysenter_post_swapgs)
1047 1050
1048 1051 BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
1049 1052 /*
1050 1053 * Jump over sys_sysenter to allow single-stepping as described
1051 1054 * above.
1052 1055 */
1053 1056 jmp _sys_sysenter_post_swapgs
1054 1057
1055 1058 ALTENTRY(sys_sysenter)
1056 1059 SWAPGS /* kernel gsbase */
1057 1060 ALTENTRY(_sys_sysenter_post_swapgs)
1058 1061
1059 1062 movq %gs:CPU_THREAD, %r15
1060 1063
1061 1064 movl $U32CS_SEL, REGOFF_CS(%rsp)
1062 1065 movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */
1063 1066 movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */
1064 1067 /*
1065 1068 * NOTE: none of the instructions that run before we get here should
1066 1069 * clobber bits in (R)FLAGS! This includes the kpti trampoline.
1067 1070 */
1068 1071 pushfq
1069 1072 popq %r10
1070 1073 movl $UDS_SEL, REGOFF_SS(%rsp)
1071 1074
1072 1075 /*
1073 1076 * Set the interrupt flag before storing the flags to the
1074 1077 * flags image on the stack so we can return to user with
1075 1078 * interrupts enabled if we return via sys_rtt_syscall32
1076 1079 */
1077 1080 orq $PS_IE, %r10
1078 1081 movq %r10, REGOFF_RFL(%rsp)
1079 1082
1080 1083 movl %edi, REGOFF_RDI(%rsp)
1081 1084 movl %esi, REGOFF_RSI(%rsp)
1082 1085 movl %ebp, REGOFF_RBP(%rsp)
1083 1086 movl %ebx, REGOFF_RBX(%rsp)
1084 1087 movl %edx, REGOFF_RDX(%rsp)
1085 1088 movl %ecx, REGOFF_RCX(%rsp)
1086 1089 movl %eax, REGOFF_RAX(%rsp) /* wrapper: sysc# -> %eax */
1087 1090 movq $0, REGOFF_SAVFP(%rsp)
1088 1091 movq $0, REGOFF_SAVPC(%rsp)
1089 1092
1090 1093 /*
1091 1094 * Copy these registers here in case we end up stopped with
1092 1095 * someone (like, say, /proc) messing with our register state.
1093 1096 * We don't -restore- them unless we have to in update_sregs.
1094 1097 *
1095 1098 * Since userland -can't- change fsbase or gsbase directly,
1096 1099 * we don't bother to capture them here.
1097 1100 */
1098 1101 xorl %ebx, %ebx
1099 1102 movw %ds, %bx
1100 1103 movq %rbx, REGOFF_DS(%rsp)
1101 1104 movw %es, %bx
1102 1105 movq %rbx, REGOFF_ES(%rsp)
1103 1106 movw %fs, %bx
1104 1107 movq %rbx, REGOFF_FS(%rsp)
1105 1108 movw %gs, %bx
1106 1109 movq %rbx, REGOFF_GS(%rsp)
1107 1110
1108 1111 /*
1109 1112 * If we're trying to use TRAPTRACE though, I take that back: we're
1110 1113 * probably debugging some problem in the SWAPGS logic and want to know
1111 1114 * what the incoming gsbase was.
1112 1115 *
1113 1116 * Since we already did SWAPGS, record the KGSBASE.
1114 1117 */
1115 1118 #if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
1116 1119 movl $MSR_AMD_KGSBASE, %ecx
1117 1120 rdmsr
1118 1121 movl %eax, REGOFF_GSBASE(%rsp)
1119 1122 movl %edx, REGOFF_GSBASE+4(%rsp)
1120 1123 #endif
1121 1124
1122 1125 /*
1123 1126 * Application state saved in the regs structure on the stack
1124 1127 * %eax is the syscall number
1125 1128 * %rsp is the thread's stack, %r15 is curthread
1126 1129 * REG_RSP(%rsp) is the user's stack
1127 1130 */
1128 1131
1129 1132 SYSCALL_TRAPTRACE($TT_SYSENTER)
1130 1133
1131 1134 movq %rsp, %rbp
1132 1135
1133 1136 movq T_LWP(%r15), %r14
1134 1137 ASSERT_NO_RUPDATE_PENDING(%r14)
1135 1138
1136 1139 ENABLE_INTR_FLAGS
1137 1140
1138 1141 /*
1139 1142 * Catch 64-bit process trying to issue sysenter instruction
1140 1143 * on Nocona based systems.
1141 1144 */
1142 1145 movq LWP_PROCP(%r14), %rax
1143 1146 cmpq $DATAMODEL_ILP32, P_MODEL(%rax)
1144 1147 je 7f
1145 1148
1146 1149 /*
1147 1150 * For a non-32-bit process, simulate a #ud, since that's what
1148 1151 * native hardware does. The traptrace entry (above) will
1149 1152 * let you know what really happened.
1150 1153 */
1151 1154 movq $T_ILLINST, REGOFF_TRAPNO(%rsp)
1152 1155 movq REGOFF_CS(%rsp), %rdi
1153 1156 movq %rdi, REGOFF_ERR(%rsp)
1154 1157 movq %rsp, %rdi
1155 1158 movq REGOFF_RIP(%rsp), %rsi
1156 1159 movl %gs:CPU_ID, %edx
1157 1160 call trap
1158 1161 jmp _sys_rtt
1159 1162 7:
1160 1163
1161 1164 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
1162 1165 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate calls) */
1163 1166
1164 1167 ASSERT_LWPTOREGS(%r14, %rsp)
1165 1168
1166 1169 incq %gs:CPU_STATS_SYS_SYSCALL
1167 1170
1168 1171 /*
1169 1172 * Make some space for MAXSYSARGS (currently 8) 32-bit args
1170 1173 * placed into 64-bit (long) arg slots, plus one 64-bit
1171 1174 * (long) arg count, maintaining 16 byte alignment.
1172 1175 */
1173 1176 subq $SYS_DROP, %rsp
1174 1177 movb $LWP_SYS, LWP_STATE(%r14)
1175 1178 movq %r15, %rdi
1176 1179 movq %rsp, %rsi
1177 1180 call syscall_entry
1178 1181
1179 1182 /*
1180 1183 * Fetch the arguments copied onto the kernel stack and put
1181 1184 * them in the right registers to invoke a C-style syscall handler.
1182 1185 * %rax contains the handler address.
1183 1186 */
1184 1187 movq %rax, %rbx
1185 1188 movl 0(%rsp), %edi
1186 1189 movl 8(%rsp), %esi
1187 1190 movl 0x10(%rsp), %edx
1188 1191 movl 0x18(%rsp), %ecx
1189 1192 movl 0x20(%rsp), %r8d
1190 1193 movl 0x28(%rsp), %r9d
1191 1194
1192 - call *SY_CALLC(%rbx)
1195 + movq SY_CALLC(%rbx), %rax
1196 + INDIRECT_CALL_REG(rax)
1193 1197
1194 1198 movq %rbp, %rsp /* pop the args */
1195 1199
1196 1200 /*
1197 1201 * amd64 syscall handlers -always- return a 64-bit value in %rax.
1198 1202 * On the 32-bit kernel, they always return that value in %eax:%edx
1199 1203 * as required by the 32-bit ABI.
1200 1204 *
1201 1205 * Simulate the same behaviour by unconditionally splitting the
1202 1206 * return value in the same way.
1203 1207 */
1204 1208 movq %rax, %r13
1205 1209 shrq $32, %r13 /* upper 32-bits into %edx */
1206 1210 movl %eax, %r12d /* lower 32-bits into %eax */
1207 1211
1208 1212 /*
1209 1213 * Optimistically assume that there's no post-syscall
1210 1214 * work to do. (This is to avoid having to call syscall_mstate()
1211 1215 * with interrupts disabled)
1212 1216 */
1213 1217 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
1214 1218
1215 1219 /*
1216 1220 * We must protect ourselves from being descheduled here;
1217 1221 * If we were, and we ended up on another cpu, or another
1218 1222 * lwp got in ahead of us, it could change the segment
1219 1223 * registers without us noticing before we return to userland.
1220 1224 *
1221 1225 * This cli is undone in the tr_sysexit trampoline code.
1222 1226 */
1223 1227 cli
1224 1228 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
1225 1229 jne _full_syscall_postsys32
1226 1230 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
1227 1231
1228 1232 /*
1229 1233 * To get back to userland, load up the 32-bit registers and
1230 1234 * sysexit back where we came from.
1231 1235 */
1232 1236
1233 1237 /*
1234 1238 * Interrupts will be turned on by the 'sti' executed just before
1235 1239 * sysexit. The following ensures that restoring the user's rflags
1236 1240 * doesn't enable interrupts too soon.
1237 1241 */
1238 1242 andq $_BITNOT(PS_IE), REGOFF_RFL(%rsp)
1239 1243
1240 1244 /*
1241 1245 * Clobber %r11 as we check CR0.TS.
1242 1246 */
1243 1247 ASSERT_CR0TS_ZERO(%r11)
1244 1248
1245 1249 /*
1246 1250 * (There's no point in loading up %edx because the sysexit
1247 1251 * mechanism smashes it.)
1248 1252 */
1249 1253 movl %r12d, %eax
1250 1254 movl REGOFF_RBX(%rsp), %ebx
1251 1255 movl REGOFF_RBP(%rsp), %ebp
1252 1256 movl REGOFF_RSI(%rsp), %esi
1253 1257 movl REGOFF_RDI(%rsp), %edi
1254 1258
1255 1259 movl REGOFF_RIP(%rsp), %edx /* sysexit: %edx -> %eip */
1256 1260 pushq REGOFF_RFL(%rsp)
1257 1261 popfq
1258 1262 movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */
1259 1263 ALTENTRY(sys_sysenter_swapgs_sysexit)
1260 - call *x86_md_clear
1264 + call x86_md_clear
1261 1265 jmp tr_sysexit
1262 1266 SET_SIZE(sys_sysenter_swapgs_sysexit)
1263 1267 SET_SIZE(sys_sysenter)
1264 1268 SET_SIZE(_sys_sysenter_post_swapgs)
1265 1269 SET_SIZE(brand_sys_sysenter)
1266 1270
1267 1271 #endif /* __lint */
1268 1272
1269 1273 /*
1270 1274 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
1271 1275 * the generic i386 libc to do system calls. We do a small amount of setup
1272 1276 * before jumping into the existing sys_syscall32 path.
1273 1277 */
1274 1278 #if defined(__lint)
1275 1279
1276 1280 /*ARGSUSED*/
1277 1281 void
1278 1282 sys_syscall_int()
1279 1283 {}
1280 1284
1281 1285 #else /* __lint */
1282 1286
1283 1287 ENTRY_NP(brand_sys_syscall_int)
1284 1288 SWAPGS /* kernel gsbase */
1285 1289 XPV_TRAP_POP
1286 1290 call smap_enable
1287 1291 BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
1288 1292 jmp nopop_syscall_int
1289 1293
1290 1294 ALTENTRY(sys_syscall_int)
1291 1295 SWAPGS /* kernel gsbase */
1292 1296 XPV_TRAP_POP
1293 1297 call smap_enable
1294 1298
1295 1299 nopop_syscall_int:
1296 1300 movq %gs:CPU_THREAD, %r15
1297 1301 movq T_STACK(%r15), %rsp
1298 1302 movl %eax, %eax
1299 1303 /*
1300 1304 * Set t_post_sys on this thread to force ourselves out via the slow
1301 1305 * path. It might be possible at some later date to optimize this out
1302 1306 * and use a faster return mechanism.
1303 1307 */
1304 1308 movb $1, T_POST_SYS(%r15)
1305 1309 CLEAN_CS
1306 1310 jmp _syscall32_save
1307 1311 /*
1308 1312 * There should be no instructions between this label and SWAPGS/IRET
1309 1313 * or we could end up breaking branded zone support. See the usage of
1310 1314 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
1311 1315 * for examples.
1312 1316 *
1313 1317 * We want to swapgs to maintain the invariant that all entries into
1314 1318 * tr_iret_user are done on the user gsbase.
1315 1319 */
1316 1320 ALTENTRY(sys_sysint_swapgs_iret)
1317 - call *x86_md_clear
1321 + call x86_md_clear
1318 1322 SWAPGS
1319 1323 jmp tr_iret_user
1320 1324 /*NOTREACHED*/
1321 1325 SET_SIZE(sys_sysint_swapgs_iret)
1322 1326 SET_SIZE(sys_syscall_int)
1323 1327 SET_SIZE(brand_sys_syscall_int)
1324 1328
1325 1329 #endif /* __lint */
1326 1330
1327 1331 /*
1328 1332 * Legacy 32-bit applications and old libc implementations do lcalls;
1329 1333 * we should never get here because the LDT entry containing the syscall
1330 1334 * segment descriptor has the "segment present" bit cleared, which means
1331 1335 * we end up processing those system calls in trap() via a not-present trap.
1332 1336 *
1333 1337 * We do it this way because a call gate unhelpfully does -nothing- to the
1334 1338 * interrupt flag bit, so an interrupt can run us just after the lcall
1335 1339 * completes, but just before the swapgs takes effect. Thus the INTR_PUSH and
1336 1340 * INTR_POP paths would have to be slightly more complex to dance around
1337 1341 * this problem, and end up depending explicitly on the first
1338 1342 * instruction of this handler being either swapgs or cli.
1339 1343 */
1340 1344
1341 1345 #if defined(__lint)
1342 1346
1343 1347 /*ARGSUSED*/
1344 1348 void
1345 1349 sys_lcall32()
1346 1350 {}
1347 1351
1348 1352 #else /* __lint */
1349 1353
1350 1354 ENTRY_NP(sys_lcall32)
1351 1355 SWAPGS /* kernel gsbase */
1352 1356 pushq $0
1353 1357 pushq %rbp
1354 1358 movq %rsp, %rbp
1355 1359 leaq __lcall_panic_str(%rip), %rdi
1356 1360 xorl %eax, %eax
1357 1361 call panic
1358 1362 SET_SIZE(sys_lcall32)
1359 1363
1360 1364 __lcall_panic_str:
1361 1365 .string "sys_lcall32: shouldn't be here!"
1362 1366
1363 1367 /*
1364 1368 * Declare a uintptr_t which covers the entire pc range of syscall
1365 1369 * handlers for the stack walkers that need this.
1366 1370 */
1367 1371 .align CPTRSIZE
1368 1372 .globl _allsyscalls_size
1369 1373 .type _allsyscalls_size, @object
1370 1374 _allsyscalls_size:
1371 1375 .NWORD . - _allsyscalls
1372 1376 SET_SIZE(_allsyscalls_size)
1373 1377
1374 1378 #endif /* __lint */
1375 1379
1376 1380 /*
1377 1381 * These are the thread context handlers for lwps using sysenter/sysexit.
1378 1382 */
1379 1383
1380 1384 #if defined(__lint)
1381 1385
1382 1386 /*ARGSUSED*/
1383 1387 void
1384 1388 sep_save(void *ksp)
1385 1389 {}
1386 1390
1387 1391 /*ARGSUSED*/
1388 1392 void
1389 1393 sep_restore(void *ksp)
1390 1394 {}
1391 1395
1392 1396 #else /* __lint */
1393 1397
1394 1398 /*
1395 1399 * setting this value to zero as we switch away causes the
1396 1400 * stack-pointer-on-sysenter to be NULL, ensuring that we
1397 1401 * don't silently corrupt another (preempted) thread stack
1398 1402 * when running an lwp that (somehow) didn't get sep_restore'd
1399 1403 */
1400 1404 ENTRY_NP(sep_save)
1401 1405 xorl %edx, %edx
1402 1406 xorl %eax, %eax
1403 1407 movl $MSR_INTC_SEP_ESP, %ecx
1404 1408 wrmsr
1405 1409 ret
1406 1410 SET_SIZE(sep_save)
1407 1411
1408 1412 /*
1409 1413 * Update the kernel stack pointer as we resume onto this cpu.
1410 1414 */
1411 1415 ENTRY_NP(sep_restore)
1412 1416 movq %rdi, %rdx
1413 1417 shrq $32, %rdx
1414 1418 movl %edi, %eax
1415 1419 movl $MSR_INTC_SEP_ESP, %ecx
1416 1420 wrmsr
1417 1421 ret
1418 1422 SET_SIZE(sep_restore)
1419 1423
1420 1424 #endif /* __lint */