Native Exception产生

自己创建一个异常

Native Exception，简称NE，是发生在C/C++代码里面最常见的一种异常。

我们在写代码的时候一些常见的操作都会导致NE，比如空指针赋值，数组越界访问等，现在我在代码里面人为的添加一个简单的exception:

test.c:

  1 #include <stdio.h>
  2 
  3 void func4()
  4 {
  5     char *p = NULL;
  6     *p = 0x5;//异常产生的地方
  7 }
  8 
  9 void func3()
 10 {
 11     int var4 = 4;
 12 }
 13 
 14 
 15 void func2()
 16 {
 17     int var3 = 3;
 18     func3();
 19     func4();
 20 }
 21 
 22 void func1()
 23 {
 24     int var1,var2;
 25     var1 = 2;
 26 }
 27 
 28 void main()
 29 {
 30     int var0 = 1;
 31     func1();
 32     func2();
 33     return;
 34 }

Android.mk

 1 LOCAL_PATH := $(call my-dir)
  2 include $(CLEAR_VARS)
  3 
  4 LOCAL_CFLAGS += -g3 -O0
  5     
  6 LOCAL_SRC_FILES := test.c
  7 
  8 LOCAL_MODULE := test
  9 
 10 LOCAL_MULTILIB := 32
 11     
 12 include $(BUILD_EXECUTABLE)

我把它放入pls/vendor/mediatek/proprietary/external/libtest/目录下面，我们对它进行编译并push到手机里面:

mmm vendor/mediatek/proprietary/external/libtest/
out/target/product/xxx/system/bin/test
adb push out/target/product/xxx/system/bin/test  system/bin/

gdb-server调试程序

启动gdbserver：

$ adb shell ./system/bin/gdbserver :1234 system/bin/test
    Process system/bin/test created; pid = 4130
    Listening on port 1234

$ adb forward tcp:1234 tcp:1234

gdb 调试这个bin文件：

$ ./prebuilts/gcc/linux-x86/arm/cit-arm-linux-androideabi-4.8/bin/arm-linux-androideabi-gdb out/target/product/xxx/symbols/system/bin/test

Reading symbols from out/target/product/xxx/symbols/system/bin/test...done.
(gdb) set solib-search-path out/target/product/xxx/symbols/system/lib/
(gdb) set solib-absolute-prefix out/target/product/xxx/symbols/
(gdb) target remote:1234
Remote debugging using :1234
Reading symbols from out/target/product/xxx/symbols/system/bin/linker...done.
Loaded symbols for out/target/product/xxx/symbols/system/bin/linker
__dl__start () at bionic/linker/arch/arm/begin.S:32
32    mov r0, sp
(gdb) list

现在test程序加载成功了：

(gdb) b main //设置断点
Breakpoint 1 at 0xaaaaa772: file vendor/mediatek/proprietary/external/libtest/test.c, line 30.
(gdb) n //单步执行
33    bl __linker_init

当运行到func4函数里面就出现异常：

(gdb) n
Program received signal SIGSEGV, Segmentation fault.
0xaaaaa740 in func4 () at vendor/mediatek/proprietary/external/libtest/test.c:6
6       *p = 0x5;

可以很清楚地知道，问题出在libtest目录下test.c的第6行（func4里面对空指针赋值）。AEE是MTK平台自己的一套处理异常的工具，代码是封装好的。当应用发生了异常，它会收集异常信息并压缩到DB文件里面，我们需要用GAT工具才能打开这个文件。在main_log里面，我们可以搜索到如下信息：

01-02 04:16:18.768  4180  4180 I AEE_AED : Build fingerprint: 'xxx:7.0/NRD90M/v6H5E-2:eng/test-keys'
01-02 04:16:18.768  4180  4180 I AEE_AED : Revision: '0'
01-02 04:16:18.768  4180  4180 I AEE_AED : ABI: 'arm'
01-02 04:16:18.768  4180  4180 I AEE_AED : pid: 4142, tid: 4142, name: test  >>> system/bin/test <<<
01-02 04:16:18.769  4180  4180 I AEE_AED : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x0
01-02 04:16:18.770  4180  4180 I AEE_AED :     r0 00000000  r1 00000005  r2 fffefa2c  r3 00000000
01-02 04:16:18.770  4180  4180 I AEE_AED :     r4 aaaaa76f  r5 fffefa24  r6 00000001  r7 fffefa2c
01-02 04:16:18.771  4180  4180 I AEE_AED :     r8 00000000  r9 00000000  sl 00000000  fp fffefa00
01-02 04:16:18.771  4180  4180 I AEE_AED :     ip f750085c  sp fffef9cc  lr aaaaa761  pc aaaaa740  cpsr 00070030
01-02 04:16:18.796  4180  4180 I AEE_AED : 
01-02 04:16:18.796  4180  4180 I AEE_AED : backtrace:
01-02 04:16:18.799  4180  4180 I AEE_AED :     #00 pc 00000740  /system/bin/test
01-02 04:16:18.800  4180  4180 I AEE_AED :     #01 pc 0000075d  /system/bin/test
01-02 04:16:18.800  4180  4180 I AEE_AED :     #02 pc 0000077b  /system/bin/test
01-02 04:16:18.800  4180  4180 I AEE_AED :     #03 pc 0001708c  /system/lib/libc.so (__libc_init+84)
01-02 04:16:18.800  4180  4180 I AEE_AED :     #04 pc 00000660  /system/bin/test
01-02 04:16:18.819   290   290 I wmt_launcher: fw log ctrl flag has been set

当native层程序发生异常的时候，kernel会进入异常处理流程，并发送一个signal给用户空间中出问题的进程；负责处理这个异常signal的就是Android的debuggerd进程，我们会在log当中找到类似如下的信息：

libc    : Fatal signal 11 (SIGSEGV), code 1, fault addr 0x14 in tid 9765 (Capture@CmdQue)

此进程可以侦测到程序崩溃，并将崩溃时的进程状态信息输出到文件和串口中，以供开发人员分析调试使用。debuggerd的数据被保存在/data/tombstones/目录下。Linux kernel有自己的一套signal机制：在应用程序崩溃时，通常内核都会发送signal到出问题的进程，以通知进程出现了什么异常，进程可以捕获这些signal并对其做相应的处理。

debuggerd创建一个名为“android:debuggerd”的socket，作为server端等待其他client端进程的连接；它接收client端进程发送来的tid和action信息，然后将由tid指定的那个进程的运行信息，按照由action指定的动作dump到文件。

C/C++程序client端

下面就将简单介绍debuggerd进程的处理过程：

在应用程序入口地址__start后，__linker_init中调用debuggerd_init()函数来注册异常信号处理handler，以实现拦截系统异常的几个signal：SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT和SIGTRAP：

bionic/linker/linker.cpp：

4172static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4173#if TIMING
4174  struct timeval t0, t1;
4175  gettimeofday(&t0, 0);
4176#endif

4179  __libc_init_AT_SECURE(args);
4180
4184  debuggerd_init();

bionic/linker/debugger.cpp：

302__LIBC_HIDDEN__ void debuggerd_init() {
303  struct sigaction action;
304  memset(&action, 0, sizeof(action));
305  sigemptyset(&action.sa_mask);
306  action.sa_sigaction = debuggerd_signal_handler;//异常处理函数；
307  action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309  // Use the alternate signal stack if available so we can catch stack overflows.
310  action.sa_flags |= SA_ONSTACK;
311
312  sigaction(SIGABRT, &action, nullptr);
313  sigaction(SIGBUS, &action, nullptr);
314  sigaction(SIGFPE, &action, nullptr);
315  sigaction(SIGILL, &action, nullptr);
316  sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318  sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320  sigaction(SIGTRAP, &action, nullptr);
321}

bionic库中的链接器会对以下七种信号设置Handler(debuggerd_signal_handler)：

    SIGILL(非法指令异常)//后文tombstone示例中，cameraserver进程收到的就是这个信号
    SIGABRT(abort退出异常)
    SIGBUS(硬件访问异常)
    SIGFPE(浮点运算异常)
    SIGSEGV(内存访问异常)//前面test程序对空指针赋值，内核发送给进程的就是这个信号
    SIGSTKFLT(协处理器栈异常)
    SIGTRAP(断点/跟踪陷阱异常)

262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263  // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264  // our "info" arg holds an undefined value.
265  if (!have_siginfo(signal_number)) {
266    info = nullptr;
267  }
268
269  log_signal_summary(signal_number, info);//打印出现问题进程信息；
270
271  send_debuggerd_packet(info);//现在处于client端，通过socket跟server端进行connect，
     //然后通过write(s, &msg, sizeof(msg))把info发给debuggerd，DEBUGGER_ACTION_CRASH为采取的行为；
272
273  // We need to return from the signal handler so that debuggerd can dump the
274  // thread that crashed, but returning here does not guarantee that the signal
275  // will be thrown again, even for SIGSEGV and friends, since the signal could
276  // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277  // preserve the SA_SIGINFO contents.
278  signal(signal_number, SIG_DFL);//设置该信号关联的动作，SIG_DFL表示默认操作，恢复到默认；
279
280  struct siginfo si;
281  if (!info) {
282    memset(&si, 0, sizeof(si));
283    si.si_code = SI_USER;
284    si.si_pid = getpid();
285    si.si_uid = getuid();
286    info = &si;
287  } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288    // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289    // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290    // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291    // check to allow all si_code values in calls coming from inside the house.
292  }
293
294  int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);
//系统调用rt_tgsigqueueinfo：把信号连同siginfo重新发送给线程组(getpid())中由tid(gettid())指定的那个线程；
295  if (rc != 0) {
296    __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297                      strerror(errno));
298    _exit(0);
299  }
300}

debuggerd进程server端：

system/core/debuggerd/debuggerd.cpp

871int main(int argc, char** argv) {
872  union selinux_callback cb;
873  if (argc == 1) {
874    cb.func_audit = audit_callback;
875    selinux_set_callback(SELINUX_CB_AUDIT, cb);
876    cb.func_log = selinux_log_callback;
877    selinux_set_callback(SELINUX_CB_LOG, cb);
878    return do_server();//不带任何参数(argc == 1)时走server流程
879  }
880
895  if (!have_tid) {
896    usage();
897    return 1;
898  }
899  return do_explicit_dump(tid, dump_backtrace);//手动导出 debuggerd -b tid 
900}

当启动debuggerd进程传递的参数个数为1时，debuggerd将作为一个后台服务进程，专门接收应用程序异常退出消息而产生tombstone：

792static int do_server() {
793  // debuggerd crashes can't be reported to debuggerd.
794  // Reset all of the crash handlers.
     //忽略debuggerd自身crash的处理；
795  signal(SIGABRT, SIG_DFL);
796  signal(SIGBUS, SIG_DFL);
797  signal(SIGFPE, SIG_DFL);
798  signal(SIGILL, SIG_DFL);
799  signal(SIGSEGV, SIG_DFL);
800#ifdef SIGSTKFLT
801  signal(SIGSTKFLT, SIG_DFL);
802#endif
803  signal(SIGTRAP, SIG_DFL);
804
805  // Ignore failed writes to closed sockets
806  signal(SIGPIPE, SIG_IGN);
807
808  // Block SIGCHLD so we can sigtimedwait for it.
809  sigset_t sigchld;
810  sigemptyset(&sigchld);
811  sigaddset(&sigchld, SIGCHLD);
812  sigprocmask(SIG_SETMASK, &sigchld, nullptr);
813  //建立socket通信的server端；
814  int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
815                              SOCK_STREAM | SOCK_CLOEXEC);
816  if (s == -1) return 1;
817
818  // Fork a process that stays root, and listens on a pipe to pause and resume the target.
819  if (!start_signal_sender()) {
820    ALOGE("debuggerd: failed to fork signal sender");
821    return 1;
822  }
823
824  ALOGI("debuggerd: starting\n");
825
826  for (;;) {
827    sockaddr_storage ss;
828    sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
829    socklen_t alen = sizeof(ss);
830
831    ALOGV("waiting for connection\n");
832    int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
833    if (fd == -1) {
834      ALOGE("accept failed: %s\n", strerror(errno));
835      continue;
836    }
837
838    handle_request(fd);//handle_request 处理请求；
839  }
840  return 0;
841}

system/core/debuggerd/debuggerd.cpp：

751static void handle_request(int fd) {
752  ALOGV("handle_request(%d)\n", fd);
753
754  ScopedFd closer(fd);
755  debugger_request_t request;
756  memset(&request, 0, sizeof(request));
757  int status = read_request(fd, &request);
     //读取client端进程发送来的数据,socket上读取debugger_msg_t结构体;
758  if (status != 0) {
759    return;
760  }
781  // Fork a child to handle the rest of the request.
782  pid_t fork_pid = fork();
783  if (fork_pid == -1) {
784    ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
785  } else if (fork_pid == 0) {
786    worker_process(fd, request);//创建一个子进程去处理dump的工作；
787  } else {
788    monitor_worker_process(fork_pid, request);//父进程监控子进程操作，结束后就会杀掉子进程；
789  }
790}

先看子进程的操作：system/core/debuggerd/debuggerd.cpp：

537static void worker_process(int fd, debugger_request_t& request) {
538  // Open the tombstone file if we need it.
539  std::string tombstone_path;
540  int tombstone_fd = -1;
541  switch (request.action) {
542    case DEBUGGER_ACTION_DUMP_TOMBSTONE:
543    case DEBUGGER_ACTION_CRASH:
544      tombstone_fd = open_tombstone(&tombstone_path);
553     //打开一个tombstone文件，限制最多10个，超过了就会被覆盖掉；
554    default:
555      ALOGE("debuggerd: unexpected request action: %d", request.action);
556      exit(1);
557  }
569
570  // Attach to the target process.
571  if (ptrace(PTRACE_ATTACH, request.tid, 0, 0) != 0) {
     //跟踪指定进程,成为它的父进程,并停止该进程，debuggerd可以拦截发送给这个thread的信号(除了
     //SIGKILL)，所以现在kernel那边发送过来的信号将被debuggerd拦截;
     //ATTACH之后,会让kernel那边发送SIGSTOP信号给原来的问题进程，这个信号将被debuggerd拦截；
572    ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
573    exit(1);
574  }
575
576  // Don't attach to the sibling threads if we want to attach gdb.
577  // Supposedly, it makes the process less reliable.
578  bool attach_gdb = should_attach_gdb(request);
587  //是否调用gdb调试，是就会终止正常的crash
588  std::set<pid_t> siblings;
589  if (!attach_gdb) {
590    ptrace_siblings(request.pid, request.tid, siblings);
       //同时跟踪问题thread相关联的thread；
591  }
592
593  // Generate the backtrace map before dropping privileges.
594  std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
595 //生成backtrace map；
596  int amfd = -1;
597  std::unique_ptr<std::string> amfd_data;
598  if (request.action == DEBUGGER_ACTION_CRASH) {
599    // Connect to the activity manager before dropping privileges.
600    amfd = activity_manager_connect();
601    amfd_data.reset(new std::string);
602  }
603
604  bool succeeded = false;
605
606  // Now that we've done everything that requires privileges, we can drop them.
607  if (!drop_privileges()) {
608    ALOGE("debuggerd: failed to drop privileges, exiting");
609    _exit(1);
610  }
611
612  int crash_signal = SIGKILL;
613  succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
614                           &crash_signal, amfd_data.get());
     //根据signal信号类型，通过engrave_tombstone把信息写到tombstone；
615  if (succeeded) {
616    if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
617      if (!tombstone_path.empty()) {
618        android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length());
619      }
620    }
621  }

631  if (!attach_gdb) {
632    // Tell the Activity Manager about the crashing process. If we are
633    // waiting for gdb to attach, do not send this or Activity Manager
634    // might kill the process before anyone can attach.
635    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
636  }
637  //解除对问题thread的跟踪；
638  if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) {
639    ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
640  }
641  //解除对问题相关联thread的跟踪；
642  for (pid_t sibling : siblings) {
643    ptrace(PTRACE_DETACH, sibling, 0, 0);
644  }
645
646  // Send the signal back to the process if it crashed and we're not waiting for gdb.
647  if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
648    if (!send_signal(request.pid, request.tid, crash_signal)) {
649      ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
650    }
651  }
667
668  close(amfd);
669
670  exit(!succeeded);
671}

455static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
456                         BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
457                         int* crash_signal, std::string* amfd_data) {
458  if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) {
459    ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
460    return false;
461  }
462
463  int total_sleep_time_usec = 0;
464  while (true) {
465    int signal = wait_for_signal(request.tid, &total_sleep_time_usec);
       //第一次等到的是SIGSTOP信号，第二次才是出现问题类型的真正信号；
466    switch (signal) {
467      case -1:
468        ALOGE("debuggerd: timed out waiting for signal");
469        return false;
470
471      case SIGSTOP:
480          ALOGV("debuggerd: stopped -- continuing");
481          if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {//将目标问题进程切换为出现问题时刻的上下文状态；
482            ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
483            return false;
484          }
485          continue;  // loop again
486        }
487        break;
488
489      case SIGABRT:
490      case SIGBUS:
491      case SIGFPE:
492      case SIGILL:
493      case SIGSEGV:
494#ifdef SIGSTKFLT
495      case SIGSTKFLT:
496#endif
497      case SIGSYS:
498      case SIGTRAP:
499        ALOGV("stopped -- fatal signal\n");
500        *crash_signal = signal;//当再一次信号过来,就会通过下面的函数导出此刻问题进程的信息；
501        engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
502                          request.original_si_code, request.abort_msg_address, amfd_data);
503        break;
504
505      default:
506        ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
507        break;
508    }
509    break;
510  }
511
512  return true;
513}

如下就是从tombstone导出来的信息：

*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
Build fingerprint: 'xxx/5049S/xxx:7.0/NRD90M/xxx:eng/test-keys'
Revision: '0'
ABI: 'arm'
pid: 505, tid: 1046, name: Binder:505_1  >>> /system/bin/cameraserver <<<
signal 4 (SIGILL), code 1 (ILL_ILLOPC), fault addr 0xe09a4988
    r0 e8d1a2c8  r1 e09a8004  r2 00000001  r3 00000002
    r4 e1106200  r5 00000001  r6 de83b891  r7 e8d438d0
    r8 00000416  r9 e8d39990  sl e930bf6d  fp e127f910
    ip de855c1c  sp e127f678  lr de83aba1  pc e09a4988  cpsr 200f0030

backtrace:
    #00 pc 00000988  /system/vendor/lib/libcancer.so (_ZN7android6Cancer15destroyInstanceEv+39)
    #01 pc 0001db9d  /system/vendor/lib/libcam.client.so (_ZN7android15NSDisplayClient13DisplayClient4initEv+60)
    #02 pc 0000d145  /system/vendor/lib/libcam.device1.so (_ZN7android14Cam1DeviceBase17initDisplayClientEP18preview_stream_ops+684)

整个tombstone包含的信息有：
(1). 创建1个tombstone文件。
最多10个,如果已存在10个,则覆盖最旧的文件。
(2). 版本信息
主要是fingerprint,可以看出异常版本是eng还是user。
(3). 寄存器信息
主要查看是哪个进程崩溃,信号是什么。寄存器信息需要配合下面的调用栈信息及数据信息结合GNU的工具(objdump -S反汇编)分析。
(4). 调用栈信息
这个是最直接可以看出异常的信息。
(5). 其他线程信息
如果异常线程和其他线程有逻辑关系的话,可以查看对应线程的信息。
(6). main log信息

最后添加一张流程图：

Native Exception产生

一键自动化博客发布工具,用过的人都说好(头条篇)

ARMv8-中斷處理接口

ARM基礎學習-存儲管理單元MMU

ARM基礎學習-快速上下文切換技術

ARM基礎學習-寄存器尋址方式和指令

ARM基礎學習-協處理器CP15

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結