以前在写驱动的时候,遇到过比较多的 kernel panic、oops 问题,
问了一些同事,得到的回答大多是加 printk。其实用 GDB 中的一个功能(list *地址)就能很方便地调试这些问题。
整理了一下,把自己给一家培训学校写的课件贴上来。
?第一章 调试
?
?1.1. 工作环境配置
?
?1 )安装好编译用的 kernel-source :RedHat :kernel-devel-xxx.rpm,
? suse:kernel-source-xxx.rpm, 自己编译的 kernel source ;
?2 ) GCC 包, gcc,g++,cpp,
?3) as,ld,objdump,etc
?4) glibc/uclibc
?5) make
?6) gdb
?7 ) SSH 工具 : SSH secure Shell 用于 windows 系统与 Linux 系统之间的文件传输
?8 )串口工具,用于调试时抓取 log 信息: Windows 下用超级终端或者 SecureCRT , Linux
?下用 minicom 、C-Kermit
?
?1.2 printk
?在内核中 printk () 的级别定义:
?#define KERN_EMERG "<0>" /* system is unusable */
?#define KERN_ALERT "<1>" /* action must be taken immediately */
?#define KERN_CRIT "<2>" /* critical conditions */
?#define KERN_ERR "<3>" /* error conditions */
?#define KERN_WARNING "<4>" /* warning conditions */
?#define KERN_NOTICE "<5>" /* normal but significant condition */
?#define KERN_INFO "<6>" /* informational */
?#define KERN_DEBUG "<7>" /* debug-level messages */
?通过 /proc/sys/kernel/printk 文件可以调节 printk 的输出级别,
?通过如下命令可以使得 Linux 内核的任何 printk 都被输出:
?#echo 8 > /proc/sys/kernel/printk
?同时设置 grub.conf :
?在 kernel 这一行加上(多个 console 参数之间用空格分隔,而不是逗号): console=tty0 console=ttyS0,115200
?
?1.3 oops 和 panic
?1.3.1 API oops DEBUG
?
?1.3.1.1. 定位 OOPS
?示例: apioops.c:
?
?#include <stdio.h>
?#include <stdlib.h> /* 原文此处头文件名丢失, stdlib.h 为推测 */
?const char array[]="\x6b\xc0 ";
?int main(int argc, char *argv[])
?{
? printf("%p\n", array);
? *(int *)0 = 0;
?}
?
?1 )编译时打开 compile with debug info 选项 (-g) :
?[root@localhost ~]# gcc -g -o apioops apioops.c
?
?
?2 )执行 apioops:
?[root@localhost ~]# ./apioops
?
?显示屏输出信息
?0x4005e0
?Segmentation fault
?[root@localhost ~]#
?
?串口输出信息:
?apioops[28910]: segfault at 0000000000000000 rip 00000000004004c0 rsp 00007fff22e15760 error 6
?rip 00000000004004c0 : 表示程序执行到这个地址时出错
?
?EIP / RIP 的值一般表示代码运行时出错的位置
?
3 )调试
?[root@localhost ~]# gdb apioops
?GNU gdb Fedora (6.8-27.el5)
?Copyright (C) 2008 Free Software Foundation, Inc.
?License GPLv3+: GNU GPL version 3 or later
?This is free software: you are free to change and redistribute it.
?There is NO WARRANTY, to the extent permitted by law. Type "show copying"
?and "show warranty" for details.
?This GDB was configured as "x86_64-redhat-linux-gnu"...
?( gdb )
?
?1.list 调试 RIP 地址:很明显 出错在第 11 行 ,访问空指针
?(gdb) l*0x4004c0
?0x4004c0 is in main (apioops.c:11).
?6
?7 const char array[] = "\x63\x2e";
?8 int main(int argc, char *argv[])
?9 {
?10 printf("%p\n", array);
?11 *(int *)0 = 0;
?12 }
?(gdb)
?
?2. run apioops ,很明显 11 行 出错
?(gdb) r
?Starting program: /root/apioops
?0x4005c8
?Program received signal SIGSEGV, Segmentation fault.
?0x00000000004004c0 in main (argc=1, argv=0x7fffa7020a28) at oops.c:11
?11 *(int *)0 = 0;
?(gdb)
?
?4) 编译时没打开 compile with debug info 选项 (-g) 时的调试,或者只有 error 信息、没有源代码时的调试。
?1. 运行 run
?(gdb) r
?Starting program: /root/apioops
?(no debugging symbols found)
?(no debugging symbols found)
?0x4005c8
?Program received signal SIGSEGV, Segmentation fault.
?0x00000000004004c0 in main ()
?(gdb)
?
?
?2. 反汇编
?(gdb) disassemble
?Dump of assembler code for function main:
?0x0000000000400498 <main+0>: push %rbp
?0x0000000000400499 <main+1>: mov %rsp,%rbp
?0x000000000040049c <main+4>: sub $0x10,%rsp
?0x00000000004004a0 <main+8>: mov %edi,-0x4(%rbp)
?0x00000000004004a3 <main+11>: mov %rsi,-0x10(%rbp)
?0x00000000004004a7 <main+15>: mov $0x4005c8,%esi
?0x00000000004004ac <main+20>: mov $0x4005cb,%edi
?0x00000000004004b1 <main+25>: mov $0x0,%eax
?0x00000000004004b6 <main+30>: callq 0x400398 <printf@plt>
?0x00000000004004bb <main+35>: mov $0x0,%eax
?0x00000000004004c0 <main+40>: movl $0x0,(%rax)
?0x00000000004004c6 <main+46>: leaveq
?0x00000000004004c7 <main+47>: retq
?End of assembler dump.
?(gdb)
?可以看到出错地址
?0x00000000004004c0 <main+40>: movl $0x0,(%rax)
?表明是这个地址的指令向空指针( rax = 0 )写入了数据
?
?3. 使用 objdump 反汇编出所有的信息 查看:
?[root@localhost ~]# objdump -d apioops > log
?………………………………….
?0000000000400498 <main>:
? 400498: 55 push %rbp
? 400499: 48 89 e5 mov %rsp,%rbp
? 40049c: 48 83 ec 10 sub $0x10,%rsp
? 4004a0: 89 7d fc mov %edi,0xfffffffffffffffc(%rbp)
? 4004a3: 48 89 75 f0 mov %rsi,0xfffffffffffffff0(%rbp)
? 4004a7: be c8 05 40 00 mov $0x4005c8,%esi
? 4004ac: bf cb 05 40 00 mov $0x4005cb,%edi
? 4004b1: b8 00 00 00 00 mov $0x0,%eax
? 4004b6: e8 dd fe ff ff callq 400398 <printf@plt>
? 4004bb: b8 00 00 00 00 mov $0x0,%eax
? 4004c0: c7 00 00 00 00 00 movl $0x0,(%rax)
? 4004c6: c9 leaveq
? 4004c7: c3 retq
? 4004c8: 90 nop
?…………………………………………
?可以看到出错地址
?0x00000000004004c0 <main+40>: movl $0x0,(%rax)
?
?1.3.2.kernel oops debug (以 x86 下为例)
?示例代码: oopsexam.c
?注意编译的时候打开 compile with debug info(-g)
?编译 : make
?创建设备节点: mknod /dev/oopsexam c 251 0
?写操作:
?[root@localhost test]# echo 1 > /dev/oopsexam
?从串口拿到的 Log:
?
?<6>Enter oopsexam_write
?Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP:
? [] :oops:oopsexam_write+0x1c/0x29
?PGD 50ccd067 PUD 50ccc067 PMD 0
?Oops: 0002 [1] SMP
?last sysfs file: /block/sda/dev
?CPU 0
?Modules linked in: oops(FU) ipv6 xfrm_nalgo crypto_api autofs4 hidp rfcomm l2cap bluetooth sunrpc dm_mirror dm_multipath scsi_dh video hwmon backlight sbs i2c_ec button battery asus_acpi acpi_memhotplug ac parport_pc lp parport floppy sg pcspkr i3000_edac edac_mc i2c_i801 i2c_core e1000 serio_raw e1000e dm_raid45 dm_message dm_region_hash dm_log dm_mod dm_mem_cache ata_piix libata shpchp mptsas mptscsih mptbase scsi_transport_sas sd_mod scsi_mod ext3 jbd uhci_hcd ohci_hcd ehci_hcd
?Pid: 28853, comm: bash Tainted: GF 2.6.18-128.el5 #1
?
?
?Pid: 28853, comm: bash Tainted: GF 2.6.18-128.el5 #1
?RIP: 0010:[] [] :oops:oopsexam_write+0x1c/0x29
?RSP: 0018:ffff8100514dbf08 EFLAGS: 00010286
?RAX: 0000000000000002 RBX: 0000000000000002 RCX: ffffffff802f7aa8
?RDX: ffffffff802f7aa8 RSI: 0000000000000000 RDI: ffffffff802f7aa0
?RBP: 0000000000000002 R08: ffffffff802f7aa8 R09: 0000000000000046
?R10: ffff8100514dbc98 R11: ffffffff80161742 R12: 00002ba383367000
?R13: ffff8100514dbf50 R14: 0000000000000000 R15: 0000000000000000
?FS: 00002ba37fb84dc0(0000) GS:ffffffff803ac000(0000) knlGS:0000000000000000
?CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
?CR2: 0000000000000000 CR3: 00000000508c4000 CR4: 00000000000006e0
?Process bash (pid: 28853, threadinfo ffff8100514da000, task ffff810051eb97a0)
?Stack: ffff810068aebd80 ffffffff8001659e c000003e00000001 ffff810068aebd80
? 0000000000000002 fffffffffffffff7 00002ba383367000 ffffffff80016e6b
? 0000000000000000 0000000000000000 0000000000000000 0000000000000002
?Call Trace:
? [] vfs_write+0xce/0x174
? [] sys_write+0x45/0x6e
? [] tracesys+0xd5/0xe0
?Code: c7 04 25 00 00 00 00 01 00 00 00 5b c3 53 48 c7 c6 a7 40 48
?RIP [] :oops:oopsexam_write+0x1c/0x29
? RSP
?CR2: 0000000000000000
? <0>Kernel panic - not syncing: Fatal exception
?
?很关键的地方 :
?RIP [] :oops:oopsexam_write+0x1c/0x29
调试:
?[root@localhost test]# gdb oops.ko
?GNU gdb Fedora (6.8-27.el5)
?Copyright (C) 2008 Free Software Foundation, Inc.
?License GPLv3+: GNU GPL version 3 or later
?This is free software: you are free to change and redistribute it.
?There is NO WARRANTY, to the extent permitted by law. Type "show copying"
?and "show warranty" for details.
?This GDB was configured as "x86_64-redhat-linux-gnu"...
?(gdb)
?1. list 调试:
?(gdb) l*oopsexam_write+0x1c
?0x1c is in oopsexam_write (/home/test/examoops.c:73).
?warning: Source file is more recent than executable.
?68 *off)
?69 {
?70 int *p=0;
?71
?72 printk(KERN_INFO "Enter %s\n",__func__);
?73 *p = 1; //create oops
?74 return len;
?75 }
?76
?77 module_init(oopsexam_init);
?(gdb)
?
?
?
?开发板上 NFS 环境下的 log 类似于以下信息,调试方法同上
?
?[root@utu-linux /test]# echo 1 >/dev/oopsexam
?<6>Enter oopsexam_write
?Unable to handle kernel NULL pointer dereference at virtual address 00000000
?pgd = c3ca4000
?[00000000] *pgd=33c96031, *pte=00000000, *ppte=00000000
?Internal error: Oops: 817 [#1]
?Modules linked in: oopsexam utuled button
?CPU: 0
?PC is at oopsexam_write+0x28/0x38 [oopsexam]
?LR is at 0x1
?pc : [] lr : [<00000001>] Not tainted
?sp : c3c29f3c ip : 60000093 fp : c3c29f4c
?r10: c3c29f78 r9 : c3c28000 r8 : c3c28000
?r7 : 40177000 r6 : c0628300 r5 : 00000000 r4 : 00000002
?r3 : 00000000 r2 : 00000001 r1 : 00004830 r0 : 00000002
?Flags: nZCv IRQs on FIQs on Mode SVC_32 Segment user
?Control: C000717F Table: 33CA4000 DAC: 00000015
?Process echo (pid: 829, stack limit = 0xc3c28194)
?Stack: (0xc3c29f3c to 0xc3c2a000)
?
?Stack: (0xc3c29f3c to 0xc3c2a000)
?9f20: 00000002
?9f40: c3c29f74 c3c29f50 c0070d84 bf004048 c0628324 c0628300 c3c29f78 00000000
?9f60: 00000000 401736bc c3c29fa4 c3c29f78 c0070ebc c0070cd4 00000000 00000000
?9f80: 00000000 00000002 40177000 40171c8c 00000004 c0022024 00000000 c3c29fa8
?9fa0: c0021ea0 c0070e80 00000002 c002877c 00000001 40177000 00000002 00000000
?9fc0: 00000002 40177000 40171c8c 40177000 00000002 0000c06c 401736bc bec36e68
?9fe0: 00000000 bec36dc4 00001920 40113490 60000010 00000001 00646f6d 00000000
?Backtrace:
?[] (oopsexam_write+0x0/0x38 [oopsexam]) from [] (vfs_write+0xc0/0x138)
? r4 = 00000002
?[] (vfs_write+0x0/0x138) from [] (sys_write+0x4c/0x74)
?[] (sys_write+0x0/0x74) from [] (ret_fast_syscall+0x0/0x2c)
? r8 = C0022024 r7 = 00000004 r6 = 40171C8C r5 = 40177000
? r4 = 00000002
?Code: eb40d1be e3a02001 e3a03000 e1a00004 (e5832000)
? Segmentation fault
?
?2. objdump
?objdump -d oopsexam.ko > log
?vim log : 查看地址 oopsexam_write+0x1c
?0000000000000000 <oopsexam_write>:
? 0: 53 push %rbx
? 1: 48 c7 c6 00 00 00 00 mov $0x0,%rsi
? 8: 48 89 d3 mov %rdx,%rbx
? b: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
? 12: 31 c0 xor %eax,%eax
? 14: e8 00 00 00 00 callq 19 <oopsexam_write+0x19>
? 19: 48 89 d8 mov %rbx,%rax
? 1c: c7 04 25 00 00 00 00 movl $0x1,0x0
? 23: 01 00 00 00
? 27: 5b pop %rbx
? 28: c3 retq
?很明显,这里是往空指针(地址 0x0 )写入 1
?
?如何找到对应的代码?那就得对照着源代码看汇编。