linux2.4.18----26.由内核态切换到用户态-wangcong02345-ChinaUnix博客

一.总体分析
在start_kernel的最后会调用rest_init-->kernel_thread 创建出一个进程init
init -->execve("/bin/sh", argv,env) 的最后在返回时,会将内核态的各个段寄存器设置为用户态的各个段寄存器
最后调用iret就从内核态切换到了用户态
二.代码分析
2.1 内核空间:在init/main.c中init进程的创建与执行

/*
 * rest_init - called at the end of start_kernel: spawns the init thread,
 * releases the kernel lock, sets need_resched on the current task and then
 * calls cpu_idle().
 */
static void rest_init(void)
{
// kernel_thread eventually goes through do_fork and creates a process whose pid is 1
// once the process has been created and the system call returns, the init function is executed
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
unlock_kernel();
current->need_resched = 1;
cpu_idle();
}

在arch/i386/kernel/process.c中 L488,其中eax=NR_clone

int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
long retval, d0;
//fn=init=0xc0105044,arg=NULL, flags=0x10e00
__asm__ __volatile__(
“movl _NR_clone, %eax" //将系统调用号eax设为NR_clone
"movl flags|CLONE_VM, %ebx" //将flags放在ebx中
"movl %%esp,%%esi"
"int $0x80" //第1个系统调用
"cmpl %%esp,%%esi" //fork之后子进程会获得新esp,但父进程还是用它原先的esp,根据这一点可以区分父子进程
"je 1f " //相等则为父进程，直接跳出 parent-->jump
"movl %4,%%eax" //子进程:将args压栈
"pushl %%eax" //子进程:
"call *%5" //子进程:调用fn=init函数
"movl %3,%0" //子进程执行完fn=init函数后调用exit
"int $0x80" //子进程:第2个系统调用exit
"1:\t"
:"=&a" (retval), "=&S" (d0)
:"0" (__NR_clone), "i" (__NR_exit),
"r" (arg), "r" (fn),
"b" (flags | CLONE_VM)
: "memory");
return retval;
}

2.2 内核空间:在init/main.c中init进程的创建与执行

/*
 * init - function run by the thread created in rest_init.
 * After the elided setup it tries a list of candidate programs with execve
 * and panics if execution ever reaches the end of the list.
 */
static int init(void * unused)
{
...... // elided: not relevant to this walkthrough
if (execute_command) // execute_command is /bin/sh in this session
execve(execute_command,argv_init,envp_init);
execve("/sbin/init",argv_init,envp_init); // the fallbacks from here on are not executed in this session
execve("/etc/init",argv_init,envp_init);
execve("/bin/init",argv_init,envp_init);
execve("/bin/sh",argv_init,envp_init);
panic("No init found. Try passing init= option to kernel.");
}

2.2.1 内核空间: execve是一个系统调用
在arch/i386/kernel/entry.S中L194

// System call entry (reached through int $0x80); eax carries the system call number.
ENTRY(system_call)
pushl %eax // save the original eax (shows up as orig_eax in struct pt_regs)
SAVE_ALL //2.2.1.1 save all registers
GET_CURRENT(%ebx) // fetch the current pointer into ebx
testb $0x02,tsk_ptrace(%ebx) // PT_TRACESYS test on the task's ptrace field
jne tracesys
cmpl $(NR_syscalls),%eax // check whether the system call number is out of range
jae badsys
call *SYMBOL_NAME(sys_call_table)(,%eax,4) // in this walkthrough this calls sys_execve
movl %eax,EAX(%esp) // store the return value in the saved-eax slot of the register frame (7th word from esp)
ENTRY(ret_from_sys_call)
cli # need_resched and signals atomic test
cmpl $0,need_resched(%ebx)
jne reschedule
cmpl $0,sigpending(%ebx)
jne signal_return
restore_all:
RESTORE_ALL

2.2.1.1在linux-2.4.18/arch/i386/kernel/entry.S中

/*
 * SAVE_ALL - clear the direction flag, push es, ds and the general-purpose
 * registers (eax, ebp, edi, esi, edx, ecx, ebx, in that order), then load
 * __KERNEL_DS into ds and es through edx.
 *
 * ebx is pushed last, so it ends up at the lowest address; this matches the
 * field order gdb prints for struct pt_regs (ebx, ecx, edx, esi, edi, ebp,
 * eax, xds, xes, ...).
 */
#define SAVE_ALL \
cld; \
pushl %es; \
pushl %ds; \
pushl %eax; \
pushl %ebp; \
pushl %edi; \
pushl %esi; \
pushl %edx; \
pushl %ecx; \
pushl %ebx; \
movl $(__KERNEL_DS),%edx; \
movl %edx,%ds; \
movl %edx,%es

执行完SAVE_ALL之后的寄存器如下所示

(gdb) info r
eax 0xb 11
ecx 0xc03ac1c0
edx 0xc03ac200
ebx 0xc0447fd3
esp 0xf7deffa4 //这个esp就是下面的struct pt_regs regs
ebp 0xe000
esi 0xc03c7fc4
edi 0xc01051c8
eip 0xc010928b 0xc010928b <system_call+11>
eflags 0x286 [ PF SF IF ]
cs 0x10 16
ss 0x18 24
ds 0x18 24
es 0x18 24
fs 0x18 24
gs 0x18 24
(gdb) x /32wx 0xf7deffa4
0xf7deffa4: 0xc0447fd3 0xc03ac1c0 0xc03ac200 0xc03c7fc4
0xf7deffb4: 0xc01051c8 0x0000e000 0x0000000b 0x00000018
0xf7deffc4: 0x00000018 0x0000000b 0xc010558a 0x00000010
0xf7deffd4: 0x00000286 0x00000010 0x00010f00 0xc0105235
0xf7deffe4: 0xc0447fd3 0xc03ac1c0 0xc03ac200 0xc0105c4b
0xf7defff4: 0x00000000 0x00000078 0xc0124774 0x00000000
0xf7df0004: 0x00000000 0x00000000 0x00000000 0x00000000
0xf7df0014: 0x00000000 0x00000000 0x00000000 0x00000000
下面这个是进入do_execve之后读出来的寄存器
(gdb) p /x regs
$6 = {ebx=0xc0447fd3, ecx=0xc03ac1c0, edx=0xc03ac200, esi=0xc03c7fc4,
edi=0xc01051c8, ebp=0xe000, eax=0xb, xds=0x18,
xes=0x18, orig_eax=0xb, eip=0xc010558a, xcs = 0x10,
eflags=0x286, esp=0x10, xss=0x10f00}

2.3在arch/i386/kernel/process.c中-->系统调用sys_execve

asmlinkage int sys_execve(struct pt_regs regs)
{
int error;
char * filename;
(gdb) p ?s
$5 = (struct pt_regs *) 0xf7deffa4 //有没有发现这个数值似曾相识?对，这个就是esp
(gdb) p /x regs
$6 = {ebx=0xc0447fd3, ecx=0xc03ac1c0, edx=0xc03ac200, esi=0xc03c7fc4, edi=0xc01051c8, ebp=0xe000, eax=0xb,
xds=0x18, xes=0x18, orig_eax=0xb, eip=0xc010558a, xcs = 0x10, eflags=0x286, esp=0x10, xss=0x10f00}
filename = getname((char *) regs.ebx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, ?s);
if (error == 0)
current->ptrace &= ~PT_DTRACE;
(gdb) p /x regs -->执行完do_execve之后再看堆栈的情况，特别注意段寄存器的变化
$1 = {ebx = 0x0, ecx = 0x0, edx = 0x0, esi = 0x0, edi = 0x0, ebp = 0x0, eax = 0x0, xds = 0x2b, xes = 0x2b,
orig_eax = 0xb, eip = 0x40000be0, xcs = 0x23, eflags = 0x286, esp = 0xbfffff40, xss = 0x2b }
putname(filename);
out:
return error;
}

注意: 这儿并没有为struct pt_regs regs分配内存空间, regs虽然不是指针,但其代表的意思和指针是一样的
regs指向已压入栈的所有寄存器，改变了regs的值就是改变了已压栈的寄存器的值，等pop时会真正改变寄存器的值
2.4 再次回到system_call

# System call entry, revisited: what happens once the execve handler returns.
ENTRY(system_call)
	pushl %eax # save orig_eax
	SAVE_ALL
	GET_CURRENT(%ebx)
	testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
	jne tracesys
	cmpl $(NR_syscalls),%eax
	jae badsys
	call *SYMBOL_NAME(sys_call_table)(,%eax,4) # calls sys_execve, which runs do_execve
	movl %eax,EAX(%esp) # execution resumes here once do_execve/sys_execve has returned
ENTRY(ret_from_sys_call)
	cli # need_resched and signals atomic test
	cmpl $0,need_resched(%ebx)
	jne reschedule
	cmpl $0,sigpending(%ebx)
	jne signal_return
restore_all:
	RESTORE_ALL # the saved register values start being restored here

注意:SAVE_ALL与RESTORE_ALL表面上看起来一个push，一个pop,里面的寄存器名称也一样
但是就是在do_execve中改变了压栈中的寄存器，所以这个RESTORE_ALL就不是一般的返回了。
2.5关于RESTORE_ALL

/*
 * RESTORE_ALL - pop the register frame built by SAVE_ALL and return with iret.
 *
 * The pops mirror the pushes in SAVE_ALL in reverse order. The extra word
 * pushed at the start of system_call (orig_eax) is discarded by adjusting
 * esp, leaving the frame that iret consumes on top of the stack.
 *
 * Labels 1, 2 and 3 (popl %ds, popl %es, iret) are paired with the fixup
 * labels 4, 5 and 6 in __ex_table:
 *   4/5: store 0 at (%esp) and jump back to retry the pop;
 *   6:   reload ds and es from ss, push 11 and call do_exit.
 * NOTE(review): presumably each fixup runs when the paired instruction
 * faults -- confirm against the exception-table handling code.
 */
#define RESTORE_ALL \
	popl %ebx; /* restore the registers pushed by SAVE_ALL */ \
	popl %ecx; \
	popl %edx; \
	popl %esi; \
	popl %edi; \
	popl %ebp; \
	popl %eax; \
1:	popl %ds; \
2:	popl %es; \
	addl $4,%esp; /* the eax pushed at the start of system_call is simply skipped */ \
3:	iret; /* return from kernel space to user space; iret pops in a fixed order */ \
.section .fixup,"ax"; \
4:	movl $0,(%esp); \
	jmp 1b; \
5:	movl $0,(%esp); \
	jmp 2b; \
6:	pushl %ss; \
	popl %ds; \
	pushl %ss; \
	popl %es; \
	pushl $11; \
	call do_exit; \
.previous; \
.section __ex_table,"a";\
	.align 4; \
	.long 1b,4b; \
	.long 2b,5b; \
	.long 3b,6b; \
.previous

a. 关于iret的出栈顺序
按intel手册,没有特权级切换时IRET只会将EIP，CS，EFLAGS弹出；当有特权级的切换时，ESP和SS也会被弹出
iret的出栈顺序是固定的，如下:
EIP --> CS --> EFLAGS --> ESP --> SS
b.执行iret之前，寄存器与栈中的值如下所示

(gdb) info r
eax 0x0 0
ecx 0x0 0
edx 0x0 0
ebx 0x0 0
esp 0xf7deffcc 0xf7deffcc
ebp 0x0 0x0
esi 0x0 0
edi 0x0 0
eip 0xc01092dd 0xc01092dd <restore_all+12>
eflags 0x86 [ PF SF ]
cs 0x10 16
ss 0x18 24
ds 0x2b 43
es 0x2b 43
fs 0x0 0
gs 0x0 0
(gdb) x /32wx 0xf7deffcc
0xf7deffcc: 0x40000be0 0x00000023 0x00000286 0xbfffff40
EIP CS EFLAGS ESP
0xf7deffdc: 0x0000002b 0xc0105235 0xc0447fd3 0xc03ac1c0
SS

c.执行iret之后，寄存器与栈中的值如下所示

1: x/i $pc
=> 0x40000be0: <error: Cannot access memory at address 0x40000be0> //pc切换到了0x40000be0
(gdb) info r
eax 0x0 0
ecx 0x0 0
edx 0x0 0
ebx 0x0 0
esp 0xbfffff40 //原文中有背景颜色的说明寄存器有改变: 与iret之前相比,esp、eip、eflags、cs、ss发生了改变
ebp 0x0 0x0
esi 0x0 0
edi 0x0 0
eip 0x40000be0
eflags 0x286 [ PF SF IF ]
cs 0x23 35
ss 0x2b 43
ds 0x2b 43
es 0x2b 43
fs 0x0 0
gs 0x0 0

解释一下:
SS=0x18=11000=index=3,Ti=0,RPL=0 -->内核态
SS=0x2b=101011=index=5,Ti=0,RPL=3 -->用户态
所以这儿是由内核态切换到了用户态
2.6 还有一个问题struct pt_regs regs中的值是在什么地方改变的？
sys_execve
-->do_execve
-->search_binary_handler
--> load_elf_binary
--> start_thread

/*
 * start_thread - rewrite the saved register frame so that the return from
 * the system call lands in the new program.
 *
 * Loads 0 into fs and gs, calls set_fs(USER_DS), then stores the user
 * segment selectors and the new entry point / stack pointer into *regs.
 * The values in the comments are the ones observed in the gdb session for
 * this walkthrough.
 */
#define start_thread(regs, new_eip, new_esp) do { \
	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
	set_fs(USER_DS); \
	regs->xds = __USER_DS; /* = 0x2B */ \
	regs->xes = __USER_DS; /* = 0x2B */ \
	regs->xss = __USER_DS; /* = 0x2B */ \
	regs->xcs = __USER_CS; /* = 0x23 */ \
	regs->eip = new_eip; /* eip = 0x40000be0 here, the value iret later pops */ \
	regs->esp = new_esp; /* esp = 0xbfffff40 here */ \
} while (0)