其他
KERNEL PWN状态切换原理及KPTI绕过
>>>> int 0x80
int 0x80
>>>> 总结一下
总结一下
rsp ---> rip
cs
rflags
rsp
ss
>>>> syscall
syscall
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
ENTRY(entry_SYSCALL_64)
/* SWAPGS_UNSAFE_STACK是一个宏,x86直接定义为swapgs指令 */
SWAPGS_UNSAFE_STACK
/* 保存栈值,并设置内核栈 */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* 通过push保存寄存器值,形成一个pt_regs结构 */
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx tuichu /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
.......
.......
LOCKDEP_SYS_EXIT // 宏的实现与 CONFIG_DEBUG_LOCK_ALLOC 内核配置选项相关,该配置允许在退出系统调用时调试锁。
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
// 恢复除 rxc 和 r11 外所有通用寄存器, 因为 rcx 寄存器为调用系统调用的应用程序的返回地址, r11 寄存器为老的 flags register
/* 根据压栈的内容,恢复 rsp 为用户态的栈顶 */
movq RSP(%rsp), %rsp
USERGS_SYSRET64
/* 调用宏 USERGS_SYSRET64 ,其扩展调用 swapgs 指令交换用户 GS 和内核GS, sysret 指令执行从系统调用处理退出 */
>>>> 总结一下
总结一下
>>>> before
before
>>>> KPTI
KPTI
>>>> swap CR3
swap CR3
ENTRY(entry_SYSCALL_64)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
// KPTI 进内核态需要切到内核页表
SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
GLOBAL(entry_SYSCALL_64_after_swapgs)
// 将用户栈偏移保存到 per-cpu 变量 rsp_scratch 中
movq %rsp, PER_CPU_VAR(rsp_scratch)
// 加载内核栈偏移
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
TRACE_IRQS_OFF
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
// 为r12-r15, rbp, rbx保留位置
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
/*
* Easy case: enable interrupts and issue the syscall. If the syscall
* needs pt_regs, we'll call a stub that disables interrupts again
* and jumps to the slow path.
*/
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
// 确保系统调用号没超过最大值,超过了则跳转到后面的符号 1 处进行返回
cmpq $__NR_syscall_max, %rax
#else
andl $__SYSCALL_MASK, %eax
cmpl $__NR_syscall_max, %eax
#endif
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
// 除系统调用外的其他调用都通过 rcx 来传第四个参数,因此将 r10 的内容设置到 rcx
movq %r10, %rcx
/*
* This call instruction is handled specially in stub_ptregs_64.
* It might end up jumping to the slow path. If it jumps, RAX
* and all argument registers are clobbered.
*/
// 调用系统调用表中对应的函数
call *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:
// 将函数返回值压到栈中,返回时弹出
movq %rax, RAX(%rsp)
1:
/*
* If we get here, then we know that pt_regs is clean for SYSRET64.
* If we see that no exit work is required (which we are required
* to check with IRQs off), then we can go straight to SYSRET64.
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
LOCKDEP_SYS_EXIT // 宏的实现与 CONFIG_DEBUG_LOCK_ALLOC 内核配置选项相关,该配置允许在退出系统调用时调试锁。
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
// 恢复除 rxc 和 r11 外所有通用寄存器, 因为 rcx 寄存器为调用系统调用的应用程序的返回地址, r11 寄存器为老的 flags register
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3 // KPTI 返回用户态需要切回用户页表
/* 根据压栈的内容,恢复 rsp 为用户态的栈顶 */
movq RSP(%rsp), %rsp
USERGS_SYSRET64
/* 调用宏 USERGS_SYSRET64 ,其扩展调用 swapgs 指令交换用户 GS 和内核GS, sysret 指令执行从系统调用处理退出 */
........
........
mov rdi, cr3
nop
nop
nop
nop
nop
and rdi, 0xFFFFFFFFFFFFE7FF
mov cr3, rdi
mov rdi, cr3
or rdi, 1000h
mov cr3, rdi
mov rdi, cr3
or rdi, 1000h
mov cr3, rdi
arch/x86/entry/entry_64.S
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
POP_REGS pop_rdi=0
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
/* Restore RDI. */
popq %rdi
SWAPGS
INTERRUPT_RETURN
swapgs_restore_regs_and_return_to_usermode
.text:FFFFFFFF81600A34 41 5F pop r15
.text:FFFFFFFF81600A36 41 5E pop r14
.text:FFFFFFFF81600A38 41 5D pop r13
.text:FFFFFFFF81600A3A 41 5C pop r12
.text:FFFFFFFF81600A3C 5D pop rbp
.text:FFFFFFFF81600A3D 5B pop rbx
.text:FFFFFFFF81600A3E 41 5B pop r11
.text:FFFFFFFF81600A40 41 5A pop r10
.text:FFFFFFFF81600A42 41 59 pop r9
.text:FFFFFFFF81600A44 41 58 pop r8
.text:FFFFFFFF81600A46 58 pop rax
.text:FFFFFFFF81600A47 59 pop rcx
.text:FFFFFFFF81600A48 5A pop rdx
.text:FFFFFFFF81600A49 5E pop rsi
.text:FFFFFFFF81600A4A 48 89 E7 mov rdi, rsp <<<<<<<<<<<<<<<<<<
.text:FFFFFFFF81600A4D 65 48 8B 24 25+ mov rsp, gs: 0x5004
.text:FFFFFFFF81600A56 FF 77 30 push qword ptr [rdi+30h]
.text:FFFFFFFF81600A59 FF 77 28 push qword ptr [rdi+28h]
.text:FFFFFFFF81600A5C FF 77 20 push qword ptr [rdi+20h]
.text:FFFFFFFF81600A5F FF 77 18 push qword ptr [rdi+18h]
.text:FFFFFFFF81600A62 FF 77 10 push qword ptr [rdi+10h]
.text:FFFFFFFF81600A65 FF 37 push qword ptr [rdi]
.text:FFFFFFFF81600A67 50 push rax
.text:FFFFFFFF81600A68 EB 43 nop
.text:FFFFFFFF81600A6A 0F 20 DF mov rdi, cr3
.text:FFFFFFFF81600A6D EB 34 jmp 0xFFFFFFFF81600AA3
.text:FFFFFFFF81600AA3 48 81 CF 00 10+ or rdi, 1000h
.text:FFFFFFFF81600AAA 0F 22 DF mov cr3, rdi
.text:FFFFFFFF81600AAD 58 pop rax
.text:FFFFFFFF81600AAE 5F pop rdi
.text:FFFFFFFF81600AAF FF 15 23 65 62+ call cs: SWAPGS
.text:FFFFFFFF81600AB5 FF 25 15 65 62+ jmp cs: INTERRUPT_RETURN
_SWAPGS
.text:FFFFFFFF8103EFC0 55 push rbp
.text:FFFFFFFF8103EFC1 48 89 E5 mov rbp, rsp
.text:FFFFFFFF8103EFC4 0F 01 F8 swapgs
.text:FFFFFFFF8103EFC7 5D pop rbp
.text:FFFFFFFF8103EFC8 C3 retn
_INTERRUPT_RETURN
.text:FFFFFFFF81600AE0 F6 44 24 20 04 test byte ptr [rsp+0x20], 4
.text:FFFFFFFF81600AE5 75 02 jnz native_irq_return_ldt
.text:FFFFFFFF81600AE7 48 CF iretq
rsp ----> mov_rdi_rsp
0
0
rip
cs
rflags
rsp
ss
https://ubuntuqa.com/article/1242.html
Linux系统调用过程分析 - 知乎
https://zhuanlan.zhihu.com/p/79236207?from_voters_page=true
https://blog.csdn.net/juS3Ve/article/details/79544927
https://www.felixcloutier.com/x86
看雪ID:b0ldfrev
https://bbs.pediy.com/user-793907.htm
推荐文章++++
好书推荐