除0异常 signal(SIGFPE, sig_fpe)-小霸王88-ChinaUnix博客

最近在论坛里遇见有人问这样的问题
signal(SIGFPE, sig_fpe); 问题是为什么发生除0异常的时候，会反复调用 sig_fpe 函数。
我整理了点相关资料和解决办法
(gdb) list
11              printf("%d\n",a);
12              int i=a/b;
13              printf("%d\n",c);
14              return 0;
15      }
16
17      static void sig_fpe(int sig)
18      {
19              printf("no:%d,%s\n",sig,"not 0");
20      }
(gdb) step
Program received signal SIGFPE, Arithmetic exception.
0x0804865b in main () at sig.cpp:12
12              int i=a/b;
(gdb) step
sig_cl (sig=8 ) at sig.cpp:17
17      static void sig_fpe(int sig)
(gdb) step
19              printf("no:%d,%s\n",sig,"not 0");
(gdb) step
no:8,not 0
20      }
(gdb) step
Program received signal SIGFPE, Arithmetic exception.
0x0804865b in main () at sig.cpp:12
12              int i=a/b;
(gdb) step
Dump of assembler code for function main:
0x08048606 :    lea    0x4(%esp),%ecx
0x0804860a :    and    $0xfffffff0,%esp
0x0804860d :    pushl 0xfffffffc(%ecx)
0x08048610 :   push   %ebp
0x08048611 :   mov    %esp,%ebp
0x08048613 :   push   %ecx
0x08048614 :   sub    $0x24,%esp
0x08048617 :   movl   $0x8048682,0x4(%esp)
0x0804861f :   movl   $0x8,(%esp)
0x08048626 :   call   0x8048478
0x0804862b :   movl   $0xa,0xffffffec(%ebp)
0x08048632 :   movl   $0x0,0xfffffff0(%ebp)
0x08048639 :   movl   $0x2,0xfffffff4(%ebp)
0x08048640 :   mov    0xffffffec(%ebp),%eax
0x08048643 :   mov    %eax,0x4(%esp)
0x08048647 :   movl   $0x804877c,(%esp)
0x0804864e :   call   0x80484c8
0x08048653 :   mov    0xffffffec(%ebp),%edx
0x08048656 :   mov    %edx,%eax
0x08048658 :   sar    $0x1f,%edx
0x0804865b :   idivl 0xfffffff0(%ebp)
0x0804865e :   mov    %eax,0xfffffff8(%ebp)
---Type <return> to continue, or q <return> to quit---
0x08048661 :   mov    0xfffffff4(%ebp),%eax
0x08048664 :   mov    %eax,0x4(%esp)
0x08048668 :   movl   $0x804877c,(%esp)
0x0804866f : call   0x80484c8
0x08048674 : mov    $0x0,%eax
0x08048679 : add    $0x24,%esp
0x0804867c : pop    %ecx
0x0804867d : pop    %ebp
0x0804867e : lea    0xfffffffc(%ecx),%esp
0x08048681 : ret
End of assembler dump.

其实发生故障（fault）类异常时（除0错误 #DE 就属于这一类），CPU（x86）会将当前 EIP（指向引发异常的指令）压栈；
发生中断时，CPU 将当前 EIP 的“后”一个地址（指向被中断指令的后一条指令）压栈。
在异常处理代码中，如果认为能够从灾难中恢复，可以不修改被压栈的 EIP，从而返回到引发异常的指令处。因此 sig_fpe 返回后，idivl 指令会被重新执行，再次触发 SIGFPE，如此反复。

《这个在Intel手册里应该能找到详细资料》

其实避免方法也很简单，就是利用非局部跳转（sigsetjmp/siglongjmp），从信号处理函数直接跳回 main，避开引发异常的指令。
这样 sig_fpe 就只会执行一次。
c code 如下：
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf jmpbuf;
void sig_fpe( int );

/*
 * Demonstrates recovering from an integer divide-by-zero: installs sig_fpe
 * as the SIGFPE handler, records a jump point with sigsetjmp(), and then
 * performs the faulting division.  The handler jumps back here, so the
 * second return of sigsetjmp() takes the "catch" branch instead of
 * re-executing the faulting instruction.
 *
 * Returns 0 on normal completion, 1 if the handler cannot be installed.
 */
int main(int ac, char *av[])
{
        /*
         * volatile: the quotient is never used, so without it an optimizing
         * compiler may drop the division and no SIGFPE would be raised.
         */
        volatile int a = 10, b = 0;

        (void)ac;
        (void)av;

        if (signal(SIGFPE, sig_fpe) == SIG_ERR) {
                perror("signal");
                return 1;
        }

        /* savemask = 1: the signal mask is saved here and restored by
           siglongjmp(), so SIGFPE is not left blocked after the jump. */
        if (sigsetjmp(jmpbuf, 1) == 0) {        /* try */
                volatile int i = a / b;         /* raises SIGFPE on x86 */
                (void)i;
        } else {                                /* catch */
                printf("catch exception\n");
        }

        return 0;
}
/*
 * SIGFPE handler: does not return to the interrupted instruction.  It jumps
 * to the context saved in jmpbuf, making the matching sigsetjmp() return 1.
 *
 * signo: the delivered signal number (unused).
 */
void sig_fpe(int signo)
{
        (void)signo;
        siglongjmp(jmpbuf, 1);
}

下面一段分析摘自互联网，仅供参考：

内核代码版本 2.6.19-rc1-git7

这些同步异常的处理在CPU的中断表中有固定的位置, 如除0, 对应第0项.
内核初始化时调用trap_init()初始化同步异常中断表.

set_trap_gate(0,&divide_error);

当用户空间除0, CPU自动跳转到内核态,并将当前寄存器信息压入内核堆栈, 然后调用entry.s中的divide_error()
我一直认为, entry.s是内核中最重要的一个文件, 完全理解这个文件, 对理解整个内核的运转非常关键.
/* Entry stub for the divide-error trap: pushes a zero error code and the
   address of do_divide_error, then jumps to the shared error_code path. */
ENTRY(divide_error)
        RING0_INT_FRAME
        pushl $0                        # no error code
        CFI_ADJUST_CFA_OFFSET 4
        pushl $do_divide_error                // push the address of the handler function
        CFI_ADJUST_CFA_OFFSET 4
        jmp error_code
        CFI_ENDPROC
/* Shared path (excerpt; part of the body is elided below).  Makes an
   indirect call through %edi and then jumps to ret_from_exception.
   NOTE(review): presumably the elided code loads the handler address pushed
   by the stub into %edi -- confirm against the full entry.S. */
error_code:
        ......
        call *%edi
        jmp ret_from_exception
        CFI_ENDPROC

所以, 除0错误在do_divide_error()中处理, 执行完后, 执行ret_from_exception().
很明显,对用户空间任务的处理不应该在do_divide_error()中实现, 因为这个函数执行在异常上下文中,优先级非常高.

do_divide_error()
/*
 * Generates a trap handler named do_<name>(regs, error_code).
 *
 * The generated function fills a siginfo_t from signr, sicode and siaddr,
 * calls notify_die() and returns early if that yields NOTIFY_STOP, and
 * otherwise forwards the trap to do_trap() together with the siginfo.
 *
 * Every line of the macro body, comment lines included, must end in a
 * backslash; otherwise the #define ends early and the remaining lines are
 * parsed as file-scope code.
 */
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
        /* Build the siginfo that accompanies the signal. */ \
        siginfo_t info; \
        info.si_signo = signr; \
        info.si_errno = 0; \
        info.si_code = sicode; \
        info.si_addr = (void __user *)siaddr; \
        /* Stop here if notify_die() reports NOTIFY_STOP. */ \
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
                                                == NOTIFY_STOP) \
                return; \
        /* Otherwise handle the trap itself. */ \
        do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}

/*
 * Common trap handling.  Records trapnr and error_code in the current
 * task's thread state, then takes one of three paths:
 *
 *   - VM_MASK set in regs->eflags: if vm86 is non-zero the trap is passed
 *     to handle_vm86_trap(); a zero result ends handling, a non-zero result
 *     falls through to signalling.  If vm86 is zero, signal directly.
 *   - not user mode: try fixup_exception(); if that fails, die().
 *   - otherwise: force signal signr on the task, with info when provided.
 */
static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
                              struct pt_regs * regs, long error_code,
                              siginfo_t *info)
{
        struct task_struct *task = current;

        task->thread.error_code = error_code;
        task->thread.trap_no = trapnr;

        if (regs->eflags & VM_MASK) {
                if (vm86) {
                        int rc = handle_vm86_trap((struct kernel_vm86_regs *) regs,
                                                  error_code, trapnr);

                        /* Zero: nothing left to signal. */
                        if (!rc)
                                return;
                }
        } else if (!user_mode(regs)) {
                /* Trap taken outside user mode: fix it up or give up. */
                if (!fixup_exception(regs))
                        die(str, regs, error_code);
                return;
        }

        /* Signal the task; use the siginfo variant when one was supplied. */
        if (info) {
                force_sig_info(signr, info, task);
                return;
        }
        force_sig(signr, task);
}
所以直接返回到ret_from_exception() (entry.s)
当用户空间任务从内核返回, 都会执行这样的路径:
ret_from_xxx() --> check_userspace() --> resume_userspace()
在resume_userspace()中:
/* Loads the thread-info flags, masks them with _TIF_WORK_MASK, and branches
   to work_pending if any masked bit is set; otherwise jumps to restore_all. */
ENTRY(resume_userspace)
         DISABLE_INTERRUPTS                # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        movl TI_flags(%ebp), %ecx
        andl $_TIF_WORK_MASK, %ecx        # article annotation: a pending signal makes this non-zero
        jne work_pending
        jmp restore_all

/* Entered with the masked flags in %ecx.  If _TIF_NEED_RESCHED is clear, go
   to work_notifysig; otherwise fall into work_resched, which calls schedule
   and re-checks the flags until no reschedule is requested. */
work_pending:
        testb $_TIF_NEED_RESCHED, %cl # if the task is also flagged for rescheduling,
                                       # the reschedule is done first; NOTE(review): the original annotation
                                       # claims this cannot trigger on the divide-error path because interrupts stay disabled -- unverified
        jz work_notifysig
work_resched:
        call schedule
        DISABLE_INTERRUPTS                # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        andl $_TIF_WORK_MASK, %ecx        # is there any work to be done other
                                        # than syscall tracing?
        jz restore_all
        testb $_TIF_NEED_RESCHED, %cl
        jnz work_resched

/* Excerpt: unless VM_MASK is set in the saved EFLAGS (then the code
   branches to work_notifysig_v86), calls do_notify_resume with %eax = %esp
   and %edx = 0, then jumps to resume_userspace_sig. */
work_notifysig:                                # deal with pending signals and
                                        # notify-resume requests
        testl $VM_MASK, EFLAGS(%esp)
        movl %esp, %eax
        jne work_notifysig_v86                # returning to kernel-space or
                                        # vm86-space
        xorl %edx, %edx
        call do_notify_resume                # article annotation: this delivers the signal to user space
        jmp resume_userspace_sig        # article annotation: loop to check for further signals to deliver

由上可见, do_notify_resume() 才会将信号deliver到用户空间.

如果用户没有挂除0处理函数,执行路径:
i386/kernel/signal.c do_notify_resume() --> do_signal() --> get_signal_to_deliver()
在get_signal_to_deliver()中, 用户空间任务直接被do_group_exit() 杀掉.

如果用户挂了除0的处理函数,执行路径:
i386/kernel/signal.c do_notify_resume() --> do_signal() --> handle_signal()
handle_signal()比较复杂, 他要构建用户栈, 修改eip等寄存器, 这样从内核返回后(entry.s中的restore_all),
会直接调用用户空间的信号处理函数. 信号处理函数执行完后, 通过vdso中的sigreturn跳板代码(vsyscall_sigreturn.S)调用sigreturn系统调用, 恢复被信号打断时的上下文(包括指向引发异常的那条指令的EIP).