自己創建一個異常
Native Exception,簡稱NE,是發生於C/C++ code裏面最常見的一種異常,
我們在寫代碼的時候一些常見的操作都會導致NE,比如空指針賦值,數組越界訪問等,現在我在代碼裏面人爲的添加一個簡單的exception:
test.c:
1 #include <stdio.h>
2
3 void func4()
4 {
5 char *p = NULL;
6 *p = 0x5;//異常產生的地方
7 }
8
9 void func3()
10 {
11 int var4 = 4;
12 }
13
14
15 void func2()
16 {
17 int var3 = 3;
18 func3();
19 func4();
20 }
21
22 void func1()
23 {
24 int var1,var2;
25 var1 = 2;
26 }
27
28 void main()
29 {
30 int var0 = 1;
31 func1();
32 func2();
33 return;
34 }
Android.mk
1 LOCAL_PATH := $(call my-dir)
2 include $(CLEAR_VARS)
3
4 LOCAL_CFLAGS += -g3 -O0
5
6 LOCAL_SRC_FILES := test.c
7
8 LOCAL_MODULE := test
9
10 LOCAL_MULTILIB := 32
11
12 include $(BUILD_EXECUTABLE)
我把它放入pls/vendor/mediatek/proprietary/external/libtest/目錄下面,我們對它進行編譯並push到手機裏面:
mmm vendor/mediatek/proprietary/external/libtest/
out/target/product/xxx/system/bin/test
adb push out/target/product/xxx/system/bin/test system/bin/
gdb-server調試程序
啓動gdbserver:
$ adb shell ./system/bin/gdbserver :1234 system/bin/test
Process system/bin/test created; pid = 4130
Listening on port 1234
$ adb forward tcp:1234 tcp:1234
gdb 調試這個bin文件:
$ ./prebuilts/gcc/linux-x86/arm/cit-arm-linux-androideabi-4.8/bin/arm-linux-androideabi-gdb out/target/product/xxx/symbols/system/bin/test
Reading symbols from out/target/product/xxx/symbols/system/bin/test...done.
(gdb) set solib-search-path out/target/product/xxx/symbols/system/lib/
(gdb) set solib-absolute-prefix out/target/product/xxx/symbols/
(gdb) target remote:1234
Remote debugging using :1234
Reading symbols from out/target/product/xxx/symbols/system/bin/linker...done.
Loaded symbols for out/target/product/xxx/symbols/system/bin/linker
__dl__start () at bionic/linker/arch/arm/begin.S:32
32 mov r0, sp
(gdb) list
現在test程序加載 成功了:
(gdb) b main //設置斷點
Breakpoint 1 at 0xaaaaa772: file vendor/mediatek/proprietary/external/libtest/test.c, line 30.
(gdb) n //單步執行
33 bl __linker_init
當運行到func4函數裏面就出現異常:
(gdb) n
Program received signal SIGSEGV, Segmentation fault.
0xaaaaa740 in func4 () at vendor/mediatek/proprietary/external/libtest/test.c:6
6 *p = 0x5;
可以很清楚地知道,問題出在libtest這個模塊裏面。AEE是MTK平臺自己的一套處理異常的工具,代碼是封裝好的:當應用app發生了異常,它會收集異常信息並壓縮到DB文件裏面,我們需要用GAT工具才能打開這個文件。在main_log裏面,我們可以搜索到如下信息:
01-02 04:16:18.768 4180 4180 I AEE_AED : Build fingerprint: 'xxx:7.0/NRD90M/v6H5E-2:eng/test-keys'
01-02 04:16:18.768 4180 4180 I AEE_AED : Revision: '0'
01-02 04:16:18.768 4180 4180 I AEE_AED : ABI: 'arm'
01-02 04:16:18.768 4180 4180 I AEE_AED : pid: 4142, tid: 4142, name: test >>> system/bin/test <<<
01-02 04:16:18.769 4180 4180 I AEE_AED : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x0
01-02 04:16:18.770 4180 4180 I AEE_AED : r0 00000000 r1 00000005 r2 fffefa2c r3 00000000
01-02 04:16:18.770 4180 4180 I AEE_AED : r4 aaaaa76f r5 fffefa24 r6 00000001 r7 fffefa2c
01-02 04:16:18.771 4180 4180 I AEE_AED : r8 00000000 r9 00000000 sl 00000000 fp fffefa00
01-02 04:16:18.771 4180 4180 I AEE_AED : ip f750085c sp fffef9cc lr aaaaa761 pc aaaaa740 cpsr 00070030
01-02 04:16:18.796 4180 4180 I AEE_AED :
01-02 04:16:18.796 4180 4180 I AEE_AED : backtrace:
01-02 04:16:18.799 4180 4180 I AEE_AED : #00 pc 00000740 /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #01 pc 0000075d /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #02 pc 0000077b /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #03 pc 0001708c /system/lib/libc.so (__libc_init+84)
01-02 04:16:18.800 4180 4180 I AEE_AED : #04 pc 00000660 /system/bin/test
01-02 04:16:18.819 290 290 I wmt_launcher: fw log ctrl flag has been set
當native層程序發生異常的時候,kernel會進入異常處理流程,並發送一個signal給用戶空間(usr)出問題的進程;負責處理這個異常signal的就是android的debuggerd這個進程,可以在log當中找到類似如下的log:
libc : Fatal signal 11 (SIGSEGV), code 1, fault addr 0x14 in tid 9765 (Capture@CmdQue)
此進程可以偵測到程序崩潰,並將崩潰時的進程狀態信息輸出到文件和串口中,以供開發人員分析調試使用。debuggerd的數據被保存在/data/tombstones/目錄下。Linux kernel有自己的一套signal機制:在應用程序崩潰時,通常系統內核都會發送signal到出問題的進程,以通知進程出現了什麼異常,這些進程可以捕獲這些signal並對其做相應的處理。
debuggerd創建一個名爲“android:debuggerd”的socket,作爲server端等待其他client端進程的連接;它接收client端進程發送來的tid和action信息,然後將由tid指定的那個線程的運行信息,按照由action指定的動作dump到文件。
c/c++程序client端
下面就將簡單介紹debuggerd進程的處理過程:
在應用程序入口地址__start後,__linker_init流程中(__linker_init_post_relocation)會調用debuggerd_init()函數來註冊異常信號處理handler,以實現攔截系統異常的幾個signal:SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT和SIGTRAP:
bionic/linker/linker.cpp:
4172static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4173#if TIMING
4174 struct timeval t0, t1;
4175 gettimeofday(&t0, 0);
4176#endif
4179 __libc_init_AT_SECURE(args);
4180
4184 debuggerd_init();
bionic/linker/debugger.cpp:
302__LIBC_HIDDEN__ void debuggerd_init() {
303 struct sigaction action;
304 memset(&action, 0, sizeof(action));
305 sigemptyset(&action.sa_mask);
306 action.sa_sigaction = debuggerd_signal_handler;//異常處理函數;
307 action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309 // Use the alternate signal stack if available so we can catch stack overflows.
310 action.sa_flags |= SA_ONSTACK;
311
312 sigaction(SIGABRT, &action, nullptr);
313 sigaction(SIGBUS, &action, nullptr);
314 sigaction(SIGFPE, &action, nullptr);
315 sigaction(SIGILL, &action, nullptr);
316 sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318 sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320 sigaction(SIGTRAP, &action, nullptr);
321}
bionic庫中的鏈接器會對以下七種信號設置Handler(debuggerd_signal_handler):
SIGILL(非法指令異常)//後面tombstone示例中,內核發給進程cameraserver的就是這個信號
SIGABRT(abort退出異常)
SIGBUS(硬件訪問異常)
SIGFPE(浮點運算異常)
SIGSEGV(內存訪問異常)//前面對空指針賦值,內核發給進程test的就是這個信號
SIGSTKFLT(協處理器棧異常)
SIGTRAP(斷點/跟蹤陷阱異常)
262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263 // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264 // our "info" arg holds an undefined value.
265 if (!have_siginfo(signal_number)) {
266 info = nullptr;
267 }
268
269 log_signal_summary(signal_number, info);//打印出現問題進程信息;
270
271 send_debuggerd_packet(info);//現在處於client端,通過socket跟server端進行connect,
//然後通過write(s, &msg, sizeof(msg))把info發給debuggerd,DEBUGGER_ACTION_CRASH爲採取的行爲;
272
273 // We need to return from the signal handler so that debuggerd can dump the
274 // thread that crashed, but returning here does not guarantee that the signal
275 // will be thrown again, even for SIGSEGV and friends, since the signal could
276 // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277 // preserve the SA_SIGINFO contents.
278 signal(signal_number, SIG_DFL);//設置該信號關聯的動作,SIG_DFL表示默認操作,恢復到默認;
279
280 struct siginfo si;
281 if (!info) {
282 memset(&si, 0, sizeof(si));
283 si.si_code = SI_USER;
284 si.si_pid = getpid();
285 si.si_uid = getuid();
286 info = &si;
287 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289 // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290 // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291 // check to allow all si_code values in calls coming from inside the house.
292 }
293
294 int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);
//系統調用rt_tgsigqueueinfo:信號和附帶的siginfo只會發送給由tgid(這裏是getpid())和tid(這裏是gettid())共同指定的那一個線程;
295 if (rc != 0) {
296 __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297 strerror(errno));
298 _exit(0);
299 }
300}
debuggerd進程server端:
system/core/debuggerd/debuggerd.cpp
871int main(int argc, char** argv) {
872 union selinux_callback cb;
873 if (argc == 1) {
874 cb.func_audit = audit_callback;
875 selinux_set_callback(SELINUX_CB_AUDIT, cb);
876 cb.func_log = selinux_log_callback;
877 selinux_set_callback(SELINUX_CB_LOG, cb);
878 return do_server();//不帶任何參數(argc == 1)時就調用這個流程
879 }
880
895 if (!have_tid) {
896 usage();
897 return 1;
898 }
899 return do_explicit_dump(tid, dump_backtrace);//手動導出 debuggerd -b tid
900}
當啓動debuggerd進程傳遞的參數個數爲1時,debuggerd將作爲一個後臺服務進程,專門接收應用程序異常退出消息而產生tombstone:
792static int do_server() {
793 // debuggerd crashes can't be reported to debuggerd.
794 // Reset all of the crash handlers.
//忽略debuggerd自身crash的處理;
795 signal(SIGABRT, SIG_DFL);
796 signal(SIGBUS, SIG_DFL);
797 signal(SIGFPE, SIG_DFL);
798 signal(SIGILL, SIG_DFL);
799 signal(SIGSEGV, SIG_DFL);
800#ifdef SIGSTKFLT
801 signal(SIGSTKFLT, SIG_DFL);
802#endif
803 signal(SIGTRAP, SIG_DFL);
804
805 // Ignore failed writes to closed sockets
806 signal(SIGPIPE, SIG_IGN);
807
808 // Block SIGCHLD so we can sigtimedwait for it.
809 sigset_t sigchld;
810 sigemptyset(&sigchld);
811 sigaddset(&sigchld, SIGCHLD);
812 sigprocmask(SIG_SETMASK, &sigchld, nullptr);
813 //建立socket通信的server端;
814 int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
815 SOCK_STREAM | SOCK_CLOEXEC);
816 if (s == -1) return 1;
817
818 // Fork a process that stays root, and listens on a pipe to pause and resume the target.
819 if (!start_signal_sender()) {
820 ALOGE("debuggerd: failed to fork signal sender");
821 return 1;
822 }
823
824 ALOGI("debuggerd: starting\n");
825
826 for (;;) {
827 sockaddr_storage ss;
828 sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
829 socklen_t alen = sizeof(ss);
830
831 ALOGV("waiting for connection\n");
832 int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
833 if (fd == -1) {
834 ALOGE("accept failed: %s\n", strerror(errno));
835 continue;
836 }
837
838 handle_request(fd);//handle_request 處理請求;
839 }
840 return 0;
841}
system/core/debuggerd/debuggerd.cpp:
751static void handle_request(int fd) {
752 ALOGV("handle_request(%d)\n", fd);
753
754 ScopedFd closer(fd);
755 debugger_request_t request;
756 memset(&request, 0, sizeof(request));
757 int status = read_request(fd, &request);
//讀取client端進程發送來的數據,socket上讀取debugger_msg_t結構體;
758 if (status != 0) {
759 return;
760 }
781 // Fork a child to handle the rest of the request.
782 pid_t fork_pid = fork();
783 if (fork_pid == -1) {
784 ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
785 } else if (fork_pid == 0) {
786 worker_process(fd, request);//創建一個子進程去處理dump的工作;
787 } else {
788 monitor_worker_process(fork_pid, request);//父進程監控子進程操作,結束後就會殺掉子進程;
789 }
790}
先看子進程的操作:system/core/debuggerd/debuggerd.cpp:
537static void worker_process(int fd, debugger_request_t& request) {
538 // Open the tombstone file if we need it.
539 std::string tombstone_path;
540 int tombstone_fd = -1;
541 switch (request.action) {
542 case DEBUGGER_ACTION_DUMP_TOMBSTONE:
543 case DEBUGGER_ACTION_CRASH:
544 tombstone_fd = open_tombstone(&tombstone_path);
553 //打開一個tombstone文件,限制最多10個,超過了就會被覆蓋掉;
554 default:
555 ALOGE("debuggerd: unexpected request action: %d", request.action);
556 exit(1);
557 }
569
570 // Attach to the target process.
571 if (ptrace(PTRACE_ATTACH, request.tid, 0, 0) != 0) {
//跟蹤指定進程,成爲它的tracer(相當於它的父進程),並停止該進程;debuggerd可以攔截發送給這個thread的信號(SIGKILL除外),
//所以現在kernel那邊發送過來的信號將被debuggerd攔截;
//ATTACH之後,kernel那邊會發送SIGSTOP信號給原來出問題的進程,這個信號也將被debuggerd攔截;
572 ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
573 exit(1);
574 }
575
576 // Don't attach to the sibling threads if we want to attach gdb.
577 // Supposedly, it makes the process less reliable.
578 bool attach_gdb = should_attach_gdb(request);
587 //是否調用gdb調試,是就會終止正常的crash
588 std::set<pid_t> siblings;
589 if (!attach_gdb) {
590 ptrace_siblings(request.pid, request.tid, siblings);
//同時跟蹤問題thread相關聯的thread;
591 }
592
593 // Generate the backtrace map before dropping privileges.
594 std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
595 //生成backtrace map;
596 int amfd = -1;
597 std::unique_ptr<std::string> amfd_data;
598 if (request.action == DEBUGGER_ACTION_CRASH) {
599 // Connect to the activity manager before dropping privileges.
600 amfd = activity_manager_connect();
601 amfd_data.reset(new std::string);
602 }
603
604 bool succeeded = false;
605
606 // Now that we've done everything that requires privileges, we can drop them.
607 if (!drop_privileges()) {
608 ALOGE("debuggerd: failed to drop privileges, exiting");
609 _exit(1);
610 }
611
612 int crash_signal = SIGKILL;
613 succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
614 &crash_signal, amfd_data.get());
//根據signal信號類型,然後通過engrave_tombstone把信息寫到tombstone;
615 if (succeeded) {
616 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
617 if (!tombstone_path.empty()) {
618 android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length());
619 }
620 }
621 }
631 if (!attach_gdb) {
632 // Tell the Activity Manager about the crashing process. If we are
633 // waiting for gdb to attach, do not send this or Activity Manager
634 // might kill the process before anyone can attach.
635 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
636 }
637 //解除對問題thread的跟蹤;
638 if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) {
639 ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
640 }
641 //解除對問題相關聯thread的跟蹤;
642 for (pid_t sibling : siblings) {
643 ptrace(PTRACE_DETACH, sibling, 0, 0);
644 }
645
646 // Send the signal back to the process if it crashed and we're not waiting for gdb.
647 if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
648 if (!send_signal(request.pid, request.tid, crash_signal)) {
649 ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
650 }
651 }
667
668 close(amfd);
669
670 exit(!succeeded);
671}
455static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
456 BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
457 int* crash_signal, std::string* amfd_data) {
458 if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) {
459 ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
460 return false;
461 }
462
463 int total_sleep_time_usec = 0;
464 while (true) {
465 int signal = wait_for_signal(request.tid, &total_sleep_time_usec);
//第一次等到的是SIGSTOP信號,第二次纔是代表問題類型的真正信號;
466 switch (signal) {
467 case -1:
468 ALOGE("debuggerd: timed out waiting for signal");
469 return false;
470
471 case SIGSTOP:
480 ALOGV("debuggerd: stopped -- continuing");
481 if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {//讓被停住的目標問題線程繼續運行,以便等到它重新觸發的真正異常信號;
482 ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
483 return false;
484 }
485 continue; // loop again
486 }
487 break;
488
489 case SIGABRT:
490 case SIGBUS:
491 case SIGFPE:
492 case SIGILL:
493 case SIGSEGV:
494#ifdef SIGSTKFLT
495 case SIGSTKFLT:
496#endif
497 case SIGSYS:
498 case SIGTRAP:
499 ALOGV("stopped -- fatal signal\n");
500 *crash_signal = signal;//當再一次有信號過來,就會通過下面的函數導出此刻問題進程的信息;
501 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
502 request.original_si_code, request.abort_msg_address, amfd_data);
503 break;
504
505 default:
506 ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
507 break;
508 }
509 break;
510 }
511
512 return true;
513}
如下就是從tombstone導出來的信息:
*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
Build fingerprint: 'xxx/5049S/xxx:7.0/NRD90M/xxx:eng/test-keys'
Revision: '0'
ABI: 'arm'
pid: 505, tid: 1046, name: Binder:505_1 >>> /system/bin/cameraserver <<<
signal 4 (SIGILL), code 1 (ILL_ILLOPC), fault addr 0xe09a4988
r0 e8d1a2c8 r1 e09a8004 r2 00000001 r3 00000002
r4 e1106200 r5 00000001 r6 de83b891 r7 e8d438d0
r8 00000416 r9 e8d39990 sl e930bf6d fp e127f910
ip de855c1c sp e127f678 lr de83aba1 pc e09a4988 cpsr 200f0030
backtrace:
#00 pc 00000988 /system/vendor/lib/libcancer.so (_ZN7android6Cancer15destroyInstanceEv+39)
#01 pc 0001db9d /system/vendor/lib/libcam.client.so (_ZN7android15NSDisplayClient13DisplayClient4initEv+60)
#02 pc 0000d145 /system/vendor/lib/libcam.device1.so (_ZN7android14Cam1DeviceBase17initDisplayClientEP18preview_stream_ops+684)
整個tombstone包含的信息有:
(1). 創建1個tombstone文件。
最多10個,如果已存在10個,則覆蓋最舊的文件。
(2). 版本信息
主要是fingerprint,可以看出異常版本是eng還是user。
(3). 寄存器信息
主要查看是哪個進程崩潰,信號是什麼。寄存器信息需要配合下面的調用棧信息及數據信息結合GNU的工具(objdump -S反彙編)分析。
(4). 調用棧信息
這個是最直接可以看出異常的信息。
(5). 其他線程信息
如果異常線程和其他線程有邏輯關係的話,可以查看對應線程的信息。
(6). main log信息
最後添加一張流程圖: