文章托管在gitee上 Android Notes , 同步csdn 本文基于Android12 分析
概述
在Android中,crash大致可以做如下分类:
- Java crash, 通常发生在Java虚拟机层面之上的,如 system_server/app java crash
- Native crash,主要是C/C++ 层面发生的crash,system_server/app也可能发生native crash,因为它们都是由zygote fork而来,而zygote是通过运行 app_process 这个native 程序启动的。
- kernel crash, 通常会触发kernel panic 死机,通常是因为驱动或硬件导致。
本篇主要是看 Native crash 抓log流程。
实现机制介绍
实现机制主要是基于信号机制和ptrace机制,如下:
- 对于Android中的应用或native程序而言,它在启动时会首先加载linker模块做一些初始化,之后控制权才会回到进程自身的逻辑,因此可以在linker初始化的时候做一些工作,以实现抓取native crash的log, 而在linker init过程,注册了一些 signal 的处理器(linux默认通常是直接kill进程)。
- 当进程异常时收到相关信号,signal 处理器会对信号流程做拦截处理,此处异常进程fork出新进程crash_dump,通过crash_dump去ptrace到异常进程,获取其调用栈、内存等信息,将输出内容写入到tombstoned提供的fd(通过socket连接tombstoned获取输出fd)。
- 当完成dump操作,会重新发送信号kill异常进程(在此操作之前会将signal 处理器重置为默认)。
流程概述图
流程大致如下图所示:
流程分析
接下来,从 linker 的入口_start开始看起。如何分析入口可见参考。
begin.S
// Entry point of the dynamic linker.
ENTRY(_start)
// Mark the return-address register (x30) as undefined in the unwind info.
.cfi_undefined x30
// Pass the initial stack pointer to __linker_init as its single argument.
mov x0, sp
bl __linker_init
// __linker_init returns an address in x0; branch to it.
br x0
END(_start)
__linker_init
// First C++ function run by the linker; called from _start with the initial
// stack pointer as raw_args. Links the linker's own image and then hands over
// to __linker_init_post_relocation, whose return value is returned unchanged.
// (Part of the body is elided in this excerpt.)
extern "C" ElfW(Addr) __linker_init(void* raw_args) {
// Wrap the raw argument block so argc/argv/envp can be accessed by name.
KernelArgumentBlock args(raw_args);
// Temporary TCB: declared uninitialized, then cleared explicitly.
bionic_tcb temp_tcb __attribute__((uninitialized));
linker_memclr(&temp_tcb, sizeof(temp_tcb));
__libc_init_main_thread_early(args, &temp_tcb);
...
// Prelink and link the linker's own soinfo; give up if either step fails.
if (!tmp_linker_so.prelink_image()) __linker_cannot_link(args.argv[0]);
if (!tmp_linker_so.link_image(SymbolLookupList(&tmp_linker_so), &tmp_linker_so, nullptr, nullptr)) __linker_cannot_link(args.argv[0]);
return __linker_init_post_relocation(args, tmp_linker_so);
}
__linker_init_post_relocation
linker的一些初始化,主要看linker_main函数
// Second stage of linker initialization, run after the linker's own image has
// been linked. Finishes libc/linker setup, works out which executable to load,
// calls linker_main() and returns the start address it produced.
static ElfW(Addr) __attribute__((noinline))
__linker_init_post_relocation(KernelArgumentBlock& args, soinfo& tmp_linker_so) {
__libc_init_main_thread_late();
if (!tmp_linker_so.protect_relro()) __linker_cannot_link(args.argv[0]);
set_bss_vma_name(&tmp_linker_so);
__libc_init_globals();
tmp_linker_so.call_constructors();
// Walk the linker's dynamic section and record its DT_SONAME, if present.
for (const ElfW(Dyn)* d = tmp_linker_so.dynamic; d->d_tag != DT_NULL; ++d) {
if (d->d_tag == DT_SONAME) {
tmp_linker_so.set_soname(tmp_linker_so.get_string(d->d_un.d_val));
}
}
const char* exe_to_load = nullptr;
// AT_ENTRY pointing at our own _start means the linker itself was executed
// directly (see the usage text below) rather than as a program's interpreter.
if (getauxval(AT_ENTRY) == reinterpret_cast<uintptr_t>(&_start)) {
if (args.argc == 3 && !strcmp(args.argv[1], "--list")) {
// "linker --list PROGRAM": ldd-like mode.
g_is_ldd = true;
exe_to_load = args.argv[2];
} else if (args.argc <= 1 || !strcmp(args.argv[1], "--help")) {
// No program given, or help requested: print usage and exit successfully.
async_safe_format_fd(STDOUT_FILENO,
"Usage: %s [--list] PROGRAM [ARGS-FOR-PROGRAM...]\n"
" %s [--list] path.zip!/PROGRAM [ARGS-FOR-PROGRAM...]\n"
"\n"
"A helper program for linking dynamic executables. Typically, the kernel loads\n"
"this program because it's the PT_INTERP of a dynamic executable.\n"
"\n"
"This program can also be run directly to load and run a dynamic executable. The\n"
"executable can be inside a zip file if it's stored uncompressed and at a\n"
"page-aligned offset.\n"
"\n"
"The --list option gives behavior equivalent to ldd(1) on other systems.\n",
args.argv[0], args.argv[0]);
_exit(EXIT_SUCCESS);
} else {
// "linker PROGRAM [ARGS...]": load argv[1]; one leading argument belongs to the linker.
exe_to_load = args.argv[1];
__libc_shared_globals()->initial_linker_arg_count = 1;
}
}
// Publish argc/argv with the linker's own leading arguments skipped.
g_argc = args.argc - __libc_shared_globals()->initial_linker_arg_count;
g_argv = args.argv + __libc_shared_globals()->initial_linker_arg_count;
g_envp = args.envp;
__libc_shared_globals()->init_progname = g_argv[0];
// Seed the soinfo list and the default namespace with the linker's own soinfo.
sonext = solist = solinker = get_libdl_info(tmp_linker_so);
g_default_namespace.add_soinfo(solinker);
ElfW(Addr) start_address = linker_main(args, exe_to_load);
// In --list mode there is nothing to run after linker_main() returns.
if (g_is_ldd) _exit(EXIT_SUCCESS);
INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));
return start_address;
}
linker_main
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
...
__libc_init_AT_SECURE(args.envp);
__system_properties_init();
platform_properties_init();
linker_debuggerd_init();
...
linker_debuggerd_init
// Builds the linker's debuggerd callback table and passes it to debuggerd_init().
void linker_debuggerd_init() {
debuggerd_callbacks_t callbacks = {
// get_process_info is only supplied when built with __ANDROID_APEX__ defined.
#if defined(__ANDROID_APEX__)
.get_process_info = get_process_info,
#endif
.post_dump = notify_gdb_of_libraries,
};
debuggerd_init(&callbacks);
}
debuggerd_init
// Prepares crash handling for this process: stores the optional callbacks,
// pre-allocates the stack later used for the dump pseudothread, and registers
// debuggerd_signal_handler through debuggerd_register_handlers().
//
// callbacks: may be null; when non-null its contents are copied into g_callbacks.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
if (callbacks) {
g_callbacks = *callbacks;
}
size_t thread_stack_pages = 8;
// Reserve the stack pages plus two extra pages, all initially inaccessible.
void* thread_stack_allocation = mmap(nullptr, PAGE_SIZE * (thread_stack_pages + 2), PROT_NONE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (thread_stack_allocation == MAP_FAILED) {
fatal_errno("failed to allocate debuggerd thread stack");
}
// Make only the middle pages read/write; the first and last page of the
// mapping stay PROT_NONE.
char* stack = static_cast<char*>(thread_stack_allocation) + PAGE_SIZE;
if (mprotect(stack, PAGE_SIZE * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
fatal_errno("failed to mprotect debuggerd thread stack");
}
// Point at the last byte of the usable region, then step back 15 bytes so the
// resulting pointer is 16 bytes below the end of the region.
stack = (stack + thread_stack_pages * PAGE_SIZE - 1);
stack -= 15;
pseudothread_stack = stack;
struct sigaction action;
memset(&action, 0, sizeof(action));
// Every signal is added to the mask that applies while the handler runs.
sigfillset(&action.sa_mask);
action.sa_sigaction = debuggerd_signal_handler;
action.sa_flags = SA_RESTART | SA_SIGINFO;
action.sa_flags |= SA_ONSTACK;
// SA_EXPOSE_TAGBITS is defined locally with its numeric value.
// NOTE(review): presumably because the libc headers in use may not provide it — confirm.
#define SA_EXPOSE_TAGBITS 0x00000800
action.sa_flags |= SA_EXPOSE_TAGBITS;
debuggerd_register_handlers(&action);
}
debuggerd_register_handlers
#define DEBUGGER_SIGNAL BIONIC_SIGNAL_DEBUGGER
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
char value[PROP_VALUE_MAX] = "";
bool enabled =
!(__system_property_get("ro.debuggable", value) > 0 && !strcmp(value, "1") &&
__system_property_get("debug.debuggerd.disable", value) > 0 && !strcmp(value, "1"));
if (enabled) {
sigaction(SIGABRT, action, nullptr);
sigaction(SIGBUS, action, nullptr);
sigaction(SIGFPE, action, nullptr);
sigaction(SIGILL, action, nullptr);
sigaction(SIGSEGV, action, nullptr);
sigaction(SIGSTKFLT, action, nullptr);
sigaction(SIGSYS, action, nullptr);
sigaction(SIGTRAP, action, nullptr);
}
sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);
}
下面是Android对一些特殊信号的定义:
// Signal numbers reserved by bionic, each expressed as an offset from __SIGRTMIN.
// BIONIC_SIGNAL_DEBUGGER is the one debuggerd_register_handlers() always registers.
#define BIONIC_SIGNAL_POSIX_TIMERS (__SIGRTMIN + 0)
#define BIONIC_SIGNAL_BACKTRACE (__SIGRTMIN + 1)
#define BIONIC_SIGNAL_DEBUGGER (__SIGRTMIN + 3)
#define BIONIC_SIGNAL_PROFILER (__SIGRTMIN + 4)
#define BIONIC_SIGNAL_ART_PROFILER (__SIGRTMIN + 6)
#define BIONIC_SIGNAL_FDTRACK (__SIGRTMIN + 7)
#define BIONIC_SIGNAL_RUN_ON_ALL_THREADS (__SIGRTMIN + 8)
信号
在linux环境,执行如下命令,就可以看到各种信号的值及对应的含义:
# kill -l
1 HUP Hangup 23 URG Urgent I/O condition 45 45 Signal 45
2 INT Interrupt 24 XCPU CPU time limit exceeded 46 46 Signal 46
3 QUIT Quit 25 XFSZ File size limit exceeded 47 47 Signal 47
4 ILL Illegal instruction 26 VTALRM Virtual timer expired 48 48 Signal 48
5 TRAP Trap 27 PROF Profiling timer expired 49 49 Signal 49
6 ABRT Aborted 28 WINCH Window size changed 50 50 Signal 50
7 BUS Bus error 29 IO I/O possible 51 51 Signal 51
8 FPE Floating point exception 30 PWR Power failure 52 52 Signal 52
9 KILL Killed 31 SYS Bad system call 53 53 Signal 53
10 USR1 User signal 1 32 32 Signal 32 54 54 Signal 54
11 SEGV Segmentation fault 33 33 Signal 33 55 55 Signal 55
12 USR2 User signal 2 34 34 Signal 34 56 56 Signal 56
13 PIPE Broken pipe 35 35 Signal 35 57 57 Signal 57
14 ALRM Alarm clock 36 36 Signal 36 58 58 Signal 58
15 TERM Terminated 37 37 Signal 37 59 59 Signal 59
16 STKFLT Stack fault 38 38 Signal 38 60 60 Signal 60
17 CHLD Child exited 39 39 Signal 39 61 61 Signal 61
18 CONT Continue 40 40 Signal 40 62 62 Signal 62
19 STOP Stopped (signal) 41 41 Signal 41 63 63 Signal 63
20 TSTP Stopped 42 42 Signal 42 64 64 Signal 64
21 TTIN Stopped (tty input) 43 43 Signal 43
22 TTOU Stopped (tty output) 44 44 Signal 44
比较常见的错误信号如下:
- 11 SEGV Segmentation fault 段错误
- 解引用空指针或未初始化的或已经被释放的指针
- 访问字节对齐错误的内存
- 向只读内存区写操作
- 读写分配的内存区域之外的内存
- 其他内存损坏
- 6 ABRT Aborted 通常是程序主动调用abort ,在tombstone文件一般有abort信息
- 7 BUS Bus error 总线错误,比如内存对齐问题
- 4 ILL Illegal instruction 非法指令问题
- 8 FPE Floating point exception 非法算术运算问题,比如执行除0操作
- 13 PIPE Broken pipe 管道损坏问题,比如向一个已经关闭的socket写
- 3 QUIT Quit Android对应用进程做了拦截处理,可以进行dump trace , 执行 kill -3 $pid
- 35 debuggerd 信号, 使用于Android,用于dump trace
当进程发生crash时,会收到相关信号,之前设置的信号处理器会进行处理
debuggerd_signal_handler
处理流程如下:
- 打印crash信号概述
- clone创建子线程去执行抓dump
- 等待抓dump完成
- 重新发送信号kill自身
// Signal handler installed by debuggerd_init() for fatal signals and for
// BIONIC_SIGNAL_DEBUGGER.
//
// Flow: normalise siginfo -> collect process info -> (fallback path, or) take
// crash_mutex, log a summary, make the process dumpable/ptraceable, clone a
// pseudothread that runs debuggerd_dispatch_pseudothread, wait for it, restore
// the prctl settings, then either unlock (debugger signal) or resend the signal.
//
// signal_number: the delivered signal.
// info:          siginfo from the kernel; replaced by a synthetic one when unusable.
// context:       ucontext of the interrupted thread, forwarded to the dumper.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
// NOTE(review): presumably preserves errno across the handler — confirm against ErrnoRestorer.
ErrnoRestorer restorer;
auto *ucontext = static_cast<ucontext_t*>(context);
// Treat the siginfo as absent when have_siginfo() reports none for this signal.
if (!have_siginfo(signal_number)) {
info = nullptr;
}
struct siginfo dummy_info = {};
if (!info) {
// Build a stand-in siginfo describing a user-sent signal from this process.
memset(&dummy_info, 0, sizeof(dummy_info));
dummy_info.si_signo = signal_number;
dummy_info.si_code = SI_USER;
dummy_info.si_pid = __getpid();
dummy_info.si_uid = getuid();
info = &dummy_info;
} else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
// NOTE(review): this branch is empty in this excerpt; it performs no work.
}
debugger_process_info process_info = {};
uintptr_t si_val = reinterpret_cast<uintptr_t>(info->si_ptr);
if (signal_number == BIONIC_SIGNAL_DEBUGGER) {
// A self-queued debugger signal packs two things into si_ptr: bit 0 is kept
// as the value, and the remaining bits are taken as the abort message pointer.
if (info->si_code == SI_QUEUE && info->si_pid == __getpid()) {
if (si_val != kDebuggerdFallbackSivalUintptrRequestDump) {
process_info.abort_msg = reinterpret_cast<void*>(si_val & ~1);
info->si_ptr = reinterpret_cast<void*>(si_val & 1);
}
}
} else if (g_callbacks.get_process_info) {
process_info = g_callbacks.get_process_info();
}
// Use the in-process fallback handler when it was explicitly requested via the
// signal value, or when PR_GET_NO_NEW_PRIVS reports 1 for this process.
if (si_val == kDebuggerdFallbackSivalUintptrRequestDump ||
prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 1) {
debuggerd_fallback_handler(info, ucontext, process_info.abort_msg);
resend_signal(info);
return;
}
// Serialise crash handling; if the lock cannot be taken, log and bail out.
int ret = pthread_mutex_lock(&crash_mutex);
if (ret != 0) {
async_safe_format_log(ANDROID_LOG_INFO, "libc", "pthread_mutex_lock failed: %s", strerror(ret));
return;
}
log_signal_summary(info);
// Everything the pseudothread needs; pseudothread_tid starts at -1 and is
// filled in through the clone() call below.
debugger_thread_info thread_info = {
.crashing_tid = __gettid(),
.pseudothread_tid = -1,
.siginfo = info,
.ucontext = context,
.process_info = process_info,
};
// Remember the current dumpable setting, then force it on for the dump.
int orig_dumpable = prctl(PR_GET_DUMPABLE);
if (prctl(PR_SET_DUMPABLE, 1) != 0) {
fatal_errno("failed to set dumpable");
}
// Allow any process to ptrace us. EINVAL is tolerated; in that case there is
// nothing to restore afterwards.
bool restore_orig_ptracer = true;
if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) != 0) {
if (errno == EINVAL) {
restore_orig_ptracer = false;
} else {
fatal_errno("failed to set traceable");
}
}
// Start the pseudothread on the pre-allocated stack. It shares the address
// space and signal handlers; the kernel is asked to store its tid into
// thread_info.pseudothread_tid and to clear that word when it exits.
pid_t child_pid =
clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
&thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
if (child_pid == -1) {
fatal_errno("failed to spawn debuggerd dispatch thread");
}
// Two-step wait on the tid word: first against the initial -1, then against
// the child's tid.
// NOTE(review): presumably the first returns once the tid is set and the second
// once it is cleared on exit — confirm futex_wait semantics.
futex_wait(&thread_info.pseudothread_tid, -1);
futex_wait(&thread_info.pseudothread_tid, child_pid);
// Put the dumpable / ptracer settings back.
if (prctl(PR_SET_DUMPABLE, orig_dumpable) != 0) {
fatal_errno("failed to restore dumpable");
}
if (restore_orig_ptracer && prctl(PR_SET_PTRACER, 0) != 0) {
fatal_errno("failed to restore traceable");
}
// A debugger-requested dump releases the lock and returns; any other signal
// is resent via resend_signal().
if (info->si_signo == BIONIC_SIGNAL_DEBUGGER) {
pthread_mutex_unlock(&crash_mutex);
} else {
resend_signal(info);
}
}
先看下resend_signal,因为中间的流程过长,影响分析。这个方法主要是确保进程在执行dump后,kill掉自身,不管是否真正成功dump。当然,它会等待crash_dump执行完或者后者退出(比如执行时发生异常),但是crash_dump不会去抓自身的异常,防止无限循环dump。
resend_signal
// Re-raises the signal described by `info` on the current thread after
// resetting its disposition to SIG_DFL. BIONIC_SIGNAL_DEBUGGER is never resent.
static void resend_signal(siginfo_t* info) {
  if (info->si_signo == BIONIC_SIGNAL_DEBUGGER) {
    return;
  }

  // Restore the default action first so the re-queued signal is not handled here again.
  signal(info->si_signo, SIG_DFL);

  const int result = syscall(SYS_rt_tgsigqueueinfo, __getpid(), __gettid(), info->si_signo, info);
  if (result != 0) {
    fatal_errno("failed to resend signal during crash");
  }
}
debuggerd_dispatch_pseudothread
在pseudothread线程处理抓crash trace流程,工作流程如下:
- pipe 创建 pseudothread 与 crash_dump 双向通信管道
- 写入 crash信息到管道,后续到 crash_dump 读取会用到
- fork子进程执行 crash_dump, 真正去抓log
- 阻塞等待 crash_dump pipe回复信息,收到继续执行 创建 vm process
- waitpid 等待 crash_dump 结束
- 等待dump进程退出
// Body of the pseudothread cloned by debuggerd_signal_handler.
//
// Sets up two pipes, writes the crash information into one of them, forks and
// execs crash_dump with those pipes as its stdin/stdout, waits for crash_dump's
// one-byte acknowledgement, creates the vm process on success, and reaps
// crash_dump.
//
// arg: pointer to the debugger_thread_info prepared by the signal handler.
// Returns 0 when crash_dump acknowledged with '\1', otherwise 1.
static int debuggerd_dispatch_pseudothread(void* arg) {
debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);
// Close descriptors 0..1023 using the raw close syscall.
// NOTE(review): the clone() that starts this thread does not pass CLONE_FILES,
// so this presumably only affects this thread's own descriptor table — confirm.
for (int i = 0; i < 1024; ++i) {
syscall(__NR_close, i);
}
// With everything closed, /dev/null must land on fd 0; mirror it onto 1 and 2.
int devnull = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR));
if (devnull == -1) {
fatal_errno("failed to open /dev/null");
} else if (devnull != 0) {
fatal_errno("expected /dev/null fd to be 0, actually %d", devnull);
}
TEMP_FAILURE_RETRY(dup2(devnull, 1));
TEMP_FAILURE_RETRY(dup2(devnull, 2));
// input_*  : crash_dump -> pseudothread (becomes crash_dump's stdout below).
// output_* : pseudothread -> crash_dump (becomes crash_dump's stdin below).
unique_fd input_read, input_write;
unique_fd output_read, output_write;
// NOTE(review): the "!= 0" after the first !Pipe(...) is redundant; the
// condition is true when either Pipe() call fails.
if (!Pipe(&input_read, &input_write) != 0 || !Pipe(&output_read, &output_write)) {
fatal_errno("failed to create pipe");
}
// Crash info message: version, siginfo, ucontext, then a version-specific tail.
uint32_t version;
ssize_t expected;
struct iovec iovs[4] = {
{.iov_base = &version, .iov_len = sizeof(version)},
{.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
{.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
};
if (thread_info->process_info.fdsan_table) {
// Version 4: the whole process_info struct follows.
version = 4;
expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic);
iovs[3] = {.iov_base = &thread_info->process_info,
.iov_len = sizeof(thread_info->process_info)};
} else {
// Version 1: only the abort message pointer follows.
version = 1;
expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic);
iovs[3] = {.iov_base = &thread_info->process_info.abort_msg, .iov_len = sizeof(uintptr_t)};
}
errno = 0;
// Require a pipe buffer of at least `expected` bytes before writing the message.
if (fcntl(output_write.get(), F_SETPIPE_SZ, expected) < static_cast<int>(expected)) {
fatal_errno("failed to set pipe buffer size");
}
ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
if (rc == -1) {
fatal_errno("failed to write crash info");
} else if (rc != expected) {
fatal("failed to write crash info, wrote %zd bytes, expected %zd", rc, expected);
}
pid_t crash_dump_pid = __fork();
if (crash_dump_pid == -1) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc",
"failed to fork in debuggerd signal handler: %s", strerror(errno));
} else if (crash_dump_pid == 0) {
// Child: wire the pipes to stdout/stdin, drop the originals, and exec crash_dump.
TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
input_read.reset();
input_write.reset();
output_read.reset();
output_write.reset();
raise_caps();
// Arguments are passed as decimal strings: crashing tid, pseudothread tid, dump type.
char main_tid[10];
char pseudothread_tid[10];
char debuggerd_dump_type[10];
async_safe_format_buffer(main_tid, sizeof(main_tid), "%d", thread_info->crashing_tid);
async_safe_format_buffer(pseudothread_tid, sizeof(pseudothread_tid), "%d",
thread_info->pseudothread_tid);
async_safe_format_buffer(debuggerd_dump_type, sizeof(debuggerd_dump_type), "%d",
get_dump_type(thread_info));
execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
nullptr, nullptr);
// Only reached if execle() failed.
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
strerror(errno));
return 1;
}
// Parent: keep only our ends of the two pipes.
input_write.reset();
output_read.reset();
// Wait for crash_dump's acknowledgement: exactly one byte with value '\1'.
char buf[4];
rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));
bool success = false;
if (rc == 1 && buf[0] == '\1') {
create_vm_process();
success = true;
} else {
// Anything else is a failure; log which kind.
if (rc == -1) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "read of IPC pipe failed: %s",
strerror(errno));
} else if (rc == 0) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc",
"crash_dump helper failed to exec, or was killed");
} else if (rc != 1) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc",
"read of IPC pipe returned unexpected value: %zd", rc);
} else if (buf[0] != '\1') {
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper reported failure");
}
}
// Reap the forked crash_dump process.
int status;
if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to wait for crash_dump helper: %s",
strerror(errno));
} else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper crashed or stopped");
}
// For real crashes (not debugger-requested dumps) read the pipe once more.
// NOTE(review): presumably this blocks until the remaining writer closes its
// end, i.e. until the dump has finished — confirm.
if (success) {
if (thread_info->siginfo->si_signo != BIONIC_SIGNAL_DEBUGGER) {
TEMP_FAILURE_RETRY(read(input_read, &buf, sizeof(buf)));
}
}
return success ? 0 : 1;
}
crash_dump#main
主要工作如下:
- 重置信号处理器,防止自身异常时dump自身
- 设置pipe信号处理器
- fork子进程进行dump,其工作如下
- 设置30s的alarm,防止dump过长或长时间卡住
- 收集打开的文件描述符
- 获取所有线程信息
- PTRACE_O_TRACECLONE 监听pseudothread clone,并通知其继续,后者会创建vm process,获取一份内存拷贝
- 连接tombstoned,获取trace输出的tombstone临时文件fd
- engrave_tombstone 输出trace内容到 tombstone fd
- 通知ams发生native crash 事件
- 通知 tombstoned 已完成dump,后者完成tombstone临时文件重命名
// Entry point of crash_dump, exec'ed by the crashing process's pseudothread with
// the crashing tid, the pseudothread tid and the dump type as arguments.
//
// Outline: neutralise our own crash handlers, fork so the worker is detached
// from the original process, ptrace every thread of the target, tell the
// pseudothread to create the vm process, detach, obtain an output fd from
// tombstoned, write the backtrace/tombstone, notify the activity manager for
// fatal signals, and finally notify tombstoned of completion.
int main(int argc, char** argv) {
DefuseSignalHandlers();
InstallSigPipeHandler();
setsid();
atrace_begin(ATRACE_TAG, "before reparent");
// The target is our parent. Open /proc/<pid> and then re-check getppid() to
// detect the parent having died in the meantime.
pid_t target_process = getppid();
std::string target_proc_path = "/proc/" + std::to_string(target_process);
int target_proc_fd = open(target_proc_path.c_str(), O_DIRECTORY | O_RDONLY);
if (target_proc_fd == -1) {
PLOG(FATAL) << "failed to open " << target_proc_path;
}
if (getppid() != target_process) {
LOG(FATAL) << "parent died";
}
atrace_end(ATRACE_TAG);
// Keep private copies of stdout/stdin for talking to the pseudothread;
// STDOUT_FILENO itself is repointed further down.
unique_fd output_pipe(dup(STDOUT_FILENO));
unique_fd input_pipe(dup(STDIN_FILENO));
// fork_exit pipe: the original process blocks on it and exits once the worker
// closes the write end.
unique_fd fork_exit_read, fork_exit_write;
if (!Pipe(&fork_exit_read, &fork_exit_write)) {
PLOG(FATAL) << "failed to create pipe";
}
pid_t forkpid = fork();
if (forkpid == -1) {
PLOG(FATAL) << "fork failed";
} else if (forkpid == 0) {
// Worker: continues below.
fork_exit_read.reset();
} else {
// Original process: wait for the worker's signal (or EOF), then exit.
fork_exit_write.reset();
char buf;
TEMP_FAILURE_RETRY(read(fork_exit_read.get(), &buf, sizeof(buf)));
_exit(0);
}
ATRACE_NAME("after reparent");
pid_t pseudothread_tid;
DebuggerdDumpType dump_type;
ProcessInfo process_info;
Initialize(argv);
ParseArgs(argc, argv, &pseudothread_tid, &dump_type);
// Arm an alarm of 30 seconds scaled by the hardware timeout multiplier.
alarm(30 * android::base::HwTimeoutMultiplier());
// Snapshot the target's open files.
OpenFilesList open_files;
{
ATRACE_NAME("open files");
populate_open_files_list(&open_files, g_target_thread);
}
// Enumerate the target's threads.
std::set<pid_t> threads;
if (!android::procinfo::GetProcessTids(g_target_thread, &threads)) {
PLOG(FATAL) << "failed to get process threads";
}
std::map<pid_t, ThreadInfo> thread_info;
siginfo_t siginfo;
std::string error;
{
ATRACE_NAME("ptrace");
for (pid_t thread : threads) {
// The pseudothread is handled separately after this loop.
if (thread == pseudothread_tid) {
continue;
}
// Failing to seize the crashing thread is fatal; other threads only warn.
if (!ptrace_seize_thread(target_proc_fd, thread, &error)) {
bool fatal = thread == g_target_thread;
LOG(fatal ? FATAL : WARNING) << error;
}
ThreadInfo info;
info.pid = target_process;
info.tid = thread;
info.uid = getuid();
info.thread_name = get_thread_name(thread);
unique_fd attr_fd(openat(target_proc_fd, "attr/current", O_RDONLY | O_CLOEXEC));
if (!android::base::ReadFdToString(attr_fd, &info.selinux_label)) {
PLOG(WARNING) << "failed to read selinux label";
}
// Threads that cannot be interrupted are detached and left out of the dump.
if (!ptrace_interrupt(thread, &info.signo)) {
PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
ptrace(PTRACE_DETACH, thread, 0, 0);
continue;
}
// Read the tagged-address control register set; -1 marks "unavailable".
struct iovec iov = {
&info.tagged_addr_ctrl,
sizeof(info.tagged_addr_ctrl),
};
if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_TAGGED_ADDR_CTRL,
reinterpret_cast<void*>(&iov)) == -1) {
info.tagged_addr_ctrl = -1;
}
if (thread == g_target_thread) {
// Crashing thread: siginfo, registers and process info come from the crash
// info message on the input pipe.
ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info);
info.siginfo = &siginfo;
info.signo = info.siginfo->si_signo;
info.command_line = get_command_line(g_target_thread);
} else {
// Other threads: fetch registers remotely; skip the thread on failure.
info.registers.reset(unwindstack::Regs::RemoteGet(thread));
if (!info.registers) {
PLOG(WARNING) << "failed to fetch registers for thread " << thread;
ptrace(PTRACE_DETACH, thread, 0, 0);
continue;
}
}
thread_info[thread] = std::move(info);
}
}
// Seize the pseudothread with clone tracing enabled, then send it the '\1'
// acknowledgement so it goes on to create the vm process.
if (!ptrace_seize_thread(target_proc_fd, pseudothread_tid, &error, PTRACE_O_TRACECLONE)) {
LOG(FATAL) << "failed to seize pseudothread: " << error;
}
if (TEMP_FAILURE_RETRY(write(output_pipe.get(), "\1", 1)) != 1) {
PLOG(FATAL) << "failed to write to pseudothread";
}
pid_t vm_pid = wait_for_vm_process(pseudothread_tid);
if (ptrace(PTRACE_DETACH, pseudothread_tid, 0, 0) != 0) {
PLOG(FATAL) << "failed to detach from pseudothread";
}
// Closing the write end releases the original crash_dump process blocked in read().
fork_exit_write.reset();
// wait_for_debugger comes from the properties below, but is always off for
// debugger-requested dumps.
bool wait_for_debugger = android::base::GetBoolProperty(
"debug.debuggerd.wait_for_debugger",
android::base::GetBoolProperty("debug.debuggerd.wait_for_gdb", false));
if (siginfo.si_signo == BIONIC_SIGNAL_DEBUGGER) {
wait_for_debugger = false;
}
// Detach from every recorded thread, re-injecting its pending signal unless it
// was the debugger signal or we are leaving the process stopped for a debugger.
for (const auto& [tid, thread] : thread_info) {
int resume_signal = thread.signo == BIONIC_SIGNAL_DEBUGGER ? 0 : thread.signo;
if (wait_for_debugger) {
resume_signal = 0;
if (tgkill(target_process, tid, SIGSTOP) != 0) {
PLOG(WARNING) << "failed to send SIGSTOP to " << tid;
}
}
LOG(DEBUG) << "detaching from thread " << tid;
if (ptrace(PTRACE_DETACH, tid, 0, resume_signal) != 0) {
PLOG(ERROR) << "failed to detach from thread " << tid;
}
}
drop_capabilities();
// Ask tombstoned for the output file descriptor(s).
{
ATRACE_NAME("tombstoned_connect");
LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
g_tombstoned_connected = tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd,
&g_proto_fd, dump_type);
}
// Point STDOUT at the tombstoned fd, or at /dev/null if tombstoned is unavailable.
if (g_tombstoned_connected) {
if (TEMP_FAILURE_RETRY(dup2(g_output_fd.get(), STDOUT_FILENO)) == -1) {
PLOG(ERROR) << "failed to dup2 output fd (" << g_output_fd.get() << ") to STDOUT_FILENO";
}
} else {
unique_fd devnull(TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR)));
TEMP_FAILURE_RETRY(dup2(devnull.get(), STDOUT_FILENO));
g_output_fd = std::move(devnull);
}
LOG(INFO) << "performing dump of process " << target_process
<< " (target tid = " << g_target_thread << ")";
int signo = siginfo.si_signo;
bool fatal_signal = signo != BIONIC_SIGNAL_DEBUGGER;
bool backtrace = false;
// For debugger-requested dumps the signal value selects the output:
// 0 -> tombstone, 1 -> backtrace; anything else is logged and treated as 0.
if (!fatal_signal) {
int si_val = siginfo.si_value.sival_int;
if (si_val == 0) {
backtrace = false;
} else if (si_val == 1) {
backtrace = true;
} else {
LOG(WARNING) << "unknown si_value value " << si_val;
}
}
// Unwinding is done against the vm process, not the original target pid.
unwindstack::UnwinderFromPid unwinder(256, vm_pid, unwindstack::Regs::CurrentArch());
if (!unwinder.Init()) {
LOG(FATAL) << "Failed to init unwinder object.";
}
std::string amfd_data;
if (backtrace) {
ATRACE_NAME("dump_backtrace");
dump_backtrace(std::move(g_output_fd), &unwinder, thread_info, g_target_thread);
} else {
{
ATRACE_NAME("fdsan table dump");
populate_fdsan_table(&open_files, unwinder.GetProcessMemory(),
process_info.fdsan_table_address);
}
{
ATRACE_NAME("engrave_tombstone");
engrave_tombstone(std::move(g_output_fd), std::move(g_proto_fd), &unwinder, thread_info,
g_target_thread, process_info, &open_files, &amfd_data);
}
}
// Report fatal crashes to the activity manager, except when the thread whose
// tid equals the process id is named "system_server".
if (fatal_signal) {
if (thread_info[target_process].thread_name != "system_server") {
activity_manager_notify(target_process, signo, amfd_data);
}
}
if (wait_for_debugger) {
ALOGI(
"***********************************************************\n"
"* Process %d has been suspended while crashing.\n"
"* To attach the debugger, run this on the host:\n"
"*\n"
"* gdbclient.py -p %d\n"
"*\n"
"***********************************************************",
target_process, target_process);
}
// Close our stdout (repointed above) before telling tombstoned we are done.
close(STDOUT_FILENO);
if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
LOG(ERROR) << "failed to notify tombstoned of completion";
}
return 0;
}
大致总结下上面流程:
- 当进程发生crash时,在linker的linker_debuggerd_init过程设置的信号处理器函数,即debuggerd_signal_handler将被调用
- 在信号处理过程,首先clone创建了子线程pseudothread去执行抓dump,它会进一步fork子进程执行crash_dump,也就是dump操作实际上是在 crash_dump 中完成的。
- crash_dump fork子进程进行dump,后者通过 ptrace 来获取crash进程相关信息,之后连接tombstoned,获取trace输出到tombstone的文件fd,通过 engrave_tombstone 输出trace内容到 tombstone 文件,后面还会通知ams发生native crash 事件
- pseudothread 等待回收 crash_dump 进程并等待抓dump完成
- debuggerd_signal_handler等待pseudothread结束,之后重新发送信号kill自身
下面继续看 crash_dump 的 main函数执行流程细节。
DefuseSignalHandlers
重置signal处理器和设置sigset, 防止去dump自身crash,以防反复去dump
// Resets the debuggerd-managed signals to their default action and clears the
// signal mask, so that a crash inside crash_dump is not itself dumped.
static void DefuseSignalHandlers() {
  // Re-register the debuggerd signal set with SIG_DFL as the handler.
  struct sigaction default_action = {};
  default_action.sa_handler = SIG_DFL;
  debuggerd_register_handlers(&default_action);

  // Replace the current mask with an empty one.
  sigset_t empty_mask;
  sigemptyset(&empty_mask);
  const bool mask_applied = sigprocmask(SIG_SETMASK, &empty_mask, nullptr) == 0;
  if (!mask_applied) {
    PLOG(FATAL) << "failed to set signal mask";
  }
}
InstallSigPipeHandler
// Makes the process ignore SIGPIPE (with SA_RESTART set).
static void InstallSigPipeHandler() {
  struct sigaction ignore_sigpipe = {};
  ignore_sigpipe.sa_flags = SA_RESTART;
  ignore_sigpipe.sa_handler = SIG_IGN;
  sigaction(SIGPIPE, &ignore_sigpipe, nullptr);
}
连接tombstoned
int main(int argc, char** argv) {
...
{
ATRACE_NAME("tombstoned_connect");
LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
g_tombstoned_connected = tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd,
&g_proto_fd, dump_type);
}
...
tombstoned_connect
通过socket连接tombstoned ,写入dump请求,并接收返回的fd,用来输出trace
// Connects to tombstoned, sends a dump request for `pid`, and receives the file
// descriptor(s) the dump should be written to.
//
// pid:               process the dump is for (placed in the request packet).
// tombstoned_socket: out; the connected socket, kept for the completion notice.
// text_output_fd:    out; fd for the text output.
// proto_output_fd:   out, optional; fd for the proto output when non-null.
// dump_type:         selects the socket (Java trace vs. crash) and whether a
//                    second fd is expected (kDebuggerdTombstoneProto).
// Returns true on success; on failure logs the reason and returns false without
// touching the out parameters.
bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* text_output_fd,
unique_fd* proto_output_fd, DebuggerdDumpType dump_type) {
// Java backtraces use the Java-trace socket; everything else uses the crash socket.
unique_fd sockfd(
socket_local_client((dump_type != kDebuggerdJavaBacktrace ? kTombstonedCrashSocketName
: kTombstonedJavaTraceSocketName),
ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET));
if (sockfd == -1) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to connect to tombstoned: %s",
strerror(errno));
return false;
}
// Send the kDumpRequest packet.
TombstonedCrashPacket packet = {};
packet.packet_type = CrashPacketType::kDumpRequest;
packet.packet.dump_request.pid = pid;
packet.packet.dump_request.dump_type = dump_type;
if (TEMP_FAILURE_RETRY(write(sockfd, &packet, sizeof(packet))) != sizeof(packet)) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write DumpRequest packet: %s",
strerror(errno));
return false;
}
// Receive the response together with one fd, or two for proto tombstones.
unique_fd tmp_output_fd, tmp_proto_fd;
ssize_t rc = -1;
if (dump_type == kDebuggerdTombstoneProto) {
rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd, &tmp_proto_fd);
} else {
rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd);
}
if (rc == -1) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc",
"failed to read response to DumpRequest packet: %s", strerror(errno));
return false;
} else if (rc != sizeof(packet)) {
async_safe_format_log(
ANDROID_LOG_ERROR, "libc",
"received DumpRequest response packet of incorrect length (expected %zu, got %zd)",
sizeof(packet), rc);
return false;
}
// Ensure the text fd is in append mode; failure here is only a warning.
int flags = fcntl(tmp_output_fd.get(), F_GETFL);
if (fcntl(tmp_output_fd.get(), F_SETFL, flags | O_APPEND) != 0) {
async_safe_format_log(ANDROID_LOG_WARN, "libc", "failed to set output fd flags: %s",
strerror(errno));
}
// Hand ownership of the socket and fds to the caller.
*tombstoned_socket = std::move(sockfd);
*text_output_fd = std::move(tmp_output_fd);
if (proto_output_fd) {
*proto_output_fd = std::move(tmp_proto_fd);
}
return true;
}
下面简单插入下 tombstoned 的启动流程,在tombstoned.rc中有如下配置,可以发现它被当做了一个可以启动的“服务”,同时在init启动它的时候会给它创建三个socket。
service tombstoned /system/bin/tombstoned
user tombstoned
group system
socket tombstoned_crash seqpacket 0666 system system
socket tombstoned_intercept seqpacket 0666 system system
socket tombstoned_java_trace seqpacket 0666 system system
writepid /dev/cpuset/system-background/tasks
那它是在哪启动的呢? 在Android12中是直接写在init.rc中,在post-fs-data流程进行启动。而在Android11是配置在tombstoned.rc,时机也是post-fs-data,通过它的注释可以知道修改是为了早点启动来抓tombstone。
on post-fs-data
...
# Start tombstoned early to be able to store tombstones.
mkdir /data/anr 0775 system system encryption=Require
mkdir /data/tombstones 0771 system system encryption=Require
mkdir /data/vendor/tombstones 0771 root root
mkdir /data/vendor/tombstones/wifi 0771 wifi wifi
start tombstoned # 启动 tombstoned
接下来看 tombstoned 的处理。
tombstoned#main
在main方法里面获取socket,并设置事件监听。
// tombstoned entry point (abridged in this excerpt): obtains the crash control
// socket by name and attaches a connection listener to it whose accept callback
// is crash_accept_cb.
int main(int, char* []) {
...
int crash_socket = android_get_control_socket(kTombstonedCrashSocketName);
...
// The tombstone CrashQueue is passed as the listener's user data.
evconnlistener* tombstone_listener =
evconnlistener_new(base, crash_accept_cb, CrashQueue::for_tombstones(), LEV_OPT_CLOSE_ON_FREE,
-1 , crash_socket);
...
}
执行 accept 后回调 crash_accept_cb
crash_accept_cb
// Listener callback for a newly accepted crash-socket connection. Allocates a
// Crash record for the connection and registers a read event with a timeout
// whose callback is crash_request_cb.
static void crash_accept_cb(evconnlistener* listener, evutil_socket_t sockfd, sockaddr*, int,
void*) {
event_base* base = evconnlistener_get_base(listener);
// Passed to crash_request_cb as its argument, which takes ownership of it.
Crash* crash = new Crash();
// One second, scaled by the hardware timeout multiplier.
struct timeval timeout = {1 * android::base::HwTimeoutMultiplier(), 0};
event* crash_event = event_new(base, sockfd, EV_TIMEOUT | EV_READ, crash_request_cb, crash);
crash->crash_socket_fd.reset(sockfd);
crash->crash_event = crash_event;
event_add(crash_event, &timeout);
}
收到client写入的dump请求后,会触发read事件,执行 crash_request_cb
crash_request_cb
// Event callback fired when a client has sent its dump request, or when the
// request timed out. Validates the request, fills in the Crash record, and
// either enqueues it or performs it immediately.
//
// sockfd: the client connection.
// ev:     libevent flags describing why the callback fired.
// arg:    the Crash* allocated in crash_accept_cb.
static void crash_request_cb(evutil_socket_t sockfd, short ev, void* arg) {
// Take ownership so that every early return below frees the Crash.
std::unique_ptr<Crash> crash(static_cast<Crash*>(arg));
TombstonedCrashPacket request = {};
if ((ev & EV_TIMEOUT) != 0) {
LOG(WARNING) << "crash request timed out";
return;
} else if ((ev & EV_READ) == 0) {
LOG(WARNING) << "tombstoned received unexpected event from crash socket";
return;
}
// Read exactly one request packet.
ssize_t rc = TEMP_FAILURE_RETRY(read(sockfd, &request, sizeof(request)));
if (rc == -1) {
PLOG(WARNING) << "failed to read from crash socket";
return;
} else if (rc != sizeof(request)) {
LOG(WARNING) << "crash socket received short read of length " << rc << " (expected "
<< sizeof(request) << ")";
return;
}
if (request.packet_type != CrashPacketType::kDumpRequest) {
LOG(WARNING) << "unexpected crash packet type, expected kDumpRequest, received "
<< StringPrintf("%#2hhX", request.packet_type);
return;
}
// Reject dump types outside the range [0, kDebuggerdTombstoneProto].
crash->crash_type = request.packet.dump_request.dump_type;
if (crash->crash_type < 0 || crash->crash_type > kDebuggerdTombstoneProto) {
LOG(WARNING) << "unexpected crash dump type: " << crash->crash_type;
return;
}
if (crash->crash_type != kDebuggerdJavaBacktrace) {
// Native dumps: use the pid carried in the request.
crash->crash_pid = request.packet.dump_request.pid;
} else {
// Java backtraces: use the peer's pid as reported by SO_PEERCRED on the
// socket instead of the pid in the request.
ucred cr = {};
socklen_t len = sizeof(cr);
int ret = getsockopt(sockfd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
if (ret != 0) {
PLOG(ERROR) << "Failed to getsockopt(..SO_PEERCRED)";
return;
}
crash->crash_pid = cr.pid;
}
// Copy the pid out first: `crash` is moved from below.
pid_t crash_pid = crash->crash_pid;
LOG(INFO) << "received crash request for pid " << crash_pid;
// If the queue accepted the crash it now owns it; otherwise handle it right away.
if (CrashQueue::for_crash(crash)->maybe_enqueue_crash(std::move(crash))) {
LOG(INFO) << "enqueueing crash request for pid " << crash_pid;
} else {
perform_request(std::move(crash));
}
}
perform_request
// Services a dump request: picks the output file descriptor(s), sends them to
// the client in a kPerformDump response, and re-arms the crash event to wait
// for the client's completion message.
//
// crash: the request being serviced. Ownership is released to the event system
//        on success; on any early return the Crash is destroyed.
static void perform_request(std::unique_ptr<Crash> crash) {
unique_fd output_fd;
// An intercept, when registered for this pid/type, supplies the output fd.
bool intercepted =
intercept_manager->GetIntercept(crash->crash_pid, crash->crash_type, &output_fd);
if (intercepted) {
// Intercepted proto tombstones get a /dev/null artifact for the proto output.
if (crash->crash_type == kDebuggerdTombstoneProto) {
crash->output.proto = CrashArtifact::devnull();
}
} else {
// Otherwise ask the queue for output files and send a dup of the text fd.
if (auto o = CrashQueue::for_crash(crash.get())->get_output(crash->crash_type); o) {
crash->output = std::move(*o);
output_fd.reset(dup(crash->output.text.fd));
} else {
LOG(ERROR) << "failed to get crash output for type " << crash->crash_type;
return;
}
}
// Send the response with one fd, or two when a proto output exists.
TombstonedCrashPacket response = {.packet_type = CrashPacketType::kPerformDump};
ssize_t rc = -1;
if (crash->output.proto) {
rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get(),
crash->output.proto->fd.get());
} else {
rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get());
}
// Our copy of the fd is no longer needed once the send has been attempted.
output_fd.reset();
if (rc == -1) {
PLOG(WARNING) << "failed to send response to CrashRequest";
return;
} else if (rc != sizeof(response)) {
PLOG(WARNING) << "crash socket write returned short";
return;
}
// Reuse the event to wait (10 seconds, scaled by the hardware timeout
// multiplier) for the client; crash_completed_cb receives the raw Crash pointer.
struct timeval timeout = {10 * android::base::HwTimeoutMultiplier(), 0};
event_base* base = event_get_base(crash->crash_event);
event_assign(crash->crash_event, base, crash->crash_socket_fd, EV_TIMEOUT | EV_READ,
crash_completed_cb, crash.get());
event_add(crash->crash_event, &timeout);
CrashQueue::for_crash(crash)->on_crash_started();
// The pointer handed to event_assign() above now carries ownership.
crash.release();
}
CrashQueue::get_output
获取输出trace的文件fd
// Creates the temporary output file(s) for a dump of the given type.
//
// Returns an empty optional for kDebuggerdNativeBacktrace, for unknown dump
// types, and for kDebuggerdTombstoneProto on a queue without proto support.
// Otherwise returns a CrashOutput with `text` set, plus `proto` for proto
// tombstones (the proto file is created first).
std::optional<CrashOutput> get_output(DebuggerdDumpType dump_type) {
  CrashOutput artifacts;

  if (dump_type == kDebuggerdNativeBacktrace) {
    return std::nullopt;
  }

  const bool is_proto = dump_type == kDebuggerdTombstoneProto;
  const bool is_text_only =
      dump_type == kDebuggerdJavaBacktrace || dump_type == kDebuggerdTombstone;
  if (!is_proto && !is_text_only) {
    LOG(ERROR) << "unexpected dump type: " << dump_type;
    return std::nullopt;
  }

  if (is_proto) {
    if (!supports_proto_) {
      LOG(ERROR) << "received kDebuggerdTombstoneProto on a queue that doesn't support proto";
      return std::nullopt;
    }
    artifacts.proto = create_temporary_file();
  }

  artifacts.text = create_temporary_file();
  return artifacts;
}
看一下 create_temporary_file 函数的实现:
// Creates the file a dump will be written to, inside this queue's directory.
//
// First tries an unnamed file via O_TMPFILE relative to dir_fd_. If that fails,
// falls back to creating a named ".temporary<N>" file (N is a process-wide
// counter) and records that name in `temporary_path`.
// Logs PLOG(FATAL) if the fallback file cannot be created either.
//
// Returns the artifact holding the open fd (and the temporary name, if any).
CrashArtifact create_temporary_file() const {
  CrashArtifact result;

  result.fd.reset(openat(dir_fd_, ".", O_WRONLY | O_APPEND | O_TMPFILE | O_CLOEXEC, 0660));
  if (result.fd == -1) {
    // O_TMPFILE was not usable here; use a uniquely numbered named file instead.
    static size_t counter = 0;
    std::string tmp_filename = StringPrintf(".temporary%zu", counter++);
    result.fd.reset(openat(dir_fd_, tmp_filename.c_str(),
                           O_WRONLY | O_APPEND | O_CREAT | O_TRUNC | O_CLOEXEC, 0660));
    if (result.fd == -1) {
      PLOG(FATAL) << "failed to create temporary tombstone in " << dir_path_;
    }
    result.temporary_path = std::move(tmp_filename);
  }

  // Return the local by name: it is moved or elided automatically, whereas
  // wrapping it in std::move() would prevent copy elision.
  return result;
}
engrave_tombstone
输出crash信息到tombstone文件:
- 基本信息,如时间
- crash线程的信息
- 更多log,如 system、main log
- 其他线程的信息
- 打开的文件信息
// Produces the tombstone for a crash: builds the proto representation, writes
// it to proto_fd when one was supplied, and writes the text form to output_fd.
//
// output_fd:     destination for the text tombstone.
// proto_fd:      destination for the serialized proto; -1 means "none".
// unwinder:      unwinder used to walk the threads' stacks.
// threads:       per-thread information keyed by tid.
// target_thread: tid of the crashing thread; must be present in `threads` on
//                the legacy text path.
// process_info:  process-level crash information.
// open_files:    optional list of the target's open files.
// amfd_data:     stored in the log context as log.amfd_data.
void engrave_tombstone(unique_fd output_fd, unique_fd proto_fd, unwindstack::Unwinder* unwinder,
const std::map<pid_t, ThreadInfo>& threads, pid_t target_thread,
const ProcessInfo& process_info, OpenFilesList* open_files,
std::string* amfd_data) {
// The proto tombstone is always built, whether or not a proto fd was supplied.
Tombstone tombstone;
engrave_tombstone_proto(&tombstone, unwinder, threads, target_thread, process_info, open_files);
if (proto_fd != -1) {
if (!tombstone.SerializeToFileDescriptor(proto_fd.get())) {
async_safe_format_log(ANDROID_LOG_ERROR, LOG_TAG, "failed to write proto tombstone: %s",
strerror(errno));
}
}
// Logging context for the text output.
log_t log;
log.current_tid = target_thread;
log.crashed_tid = target_thread;
log.tfd = output_fd.get();
log.amfd_data = amfd_data;
// By default the text is generated from the proto; the property below can
// switch to the direct text path.
bool translate_proto = GetBoolProperty("debug.debuggerd.translate_proto_to_text", true);
if (translate_proto) {
tombstone_proto_to_text(tombstone, [&log](const std::string& line, bool should_log) {
_LOG(&log, should_log ? logtype::HEADER : logtype::LOGS, "%s\n", line.c_str());
});
} else {
// Direct text path: header, crashing thread, optional logs, remaining
// threads, open files, optional logs again.
bool want_logs = GetBoolProperty("ro.debuggable", false);
_LOG(&log, logtype::HEADER,
"*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
dump_header_info(&log);
_LOG(&log, logtype::HEADER, "Timestamp: %s\n", get_timestamp().c_str());
auto it = threads.find(target_thread);
if (it == threads.end()) {
async_safe_fatal("failed to find target thread");
}
// Crashing thread first (last argument true).
dump_thread(&log, unwinder, it->second, process_info, true);
if (want_logs) {
dump_logs(&log, it->second.pid, 50);
}
// Then every other thread.
for (auto& [tid, thread_info] : threads) {
if (tid == target_thread) {
continue;
}
dump_thread(&log, unwinder, thread_info, process_info, false);
}
if (open_files) {
_LOG(&log, logtype::OPEN_FILES, "\nopen files:\n");
dump_open_files_list(&log, *open_files, " ");
}
if (want_logs) {
dump_logs(&log, it->second.pid, 0);
}
}
}
_LOG
输出log到指定fd和logcat
// printf-style front end for _VLOG: packs the variadic arguments into a va_list and forwards
// them together with the log context and log type.
__attribute__((__weak__, visibility("default")))
void _LOG(log_t* log, enum logtype ltype, const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  _VLOG(log, ltype, fmt, args);
  va_end(args);
}
// Formats a message and sends it to up to three sinks:
//   1. the tombstone fd (log->tfd), when it is not -1;
//   2. the logcat crash buffer, when the log type is allowed in logcat and the thread currently
//      being dumped is the crashing thread (both tids valid and equal); the message is then also
//      appended to log->amfd_data if that pointer is non-null;
//   3. /dev/kmsg_debug, only in case 2 and only when should_write_to_kmsg() returned true
//      (evaluated once and cached).
// Empty messages are dropped.
__attribute__((__weak__, visibility("default")))
void _VLOG(log_t* log, enum logtype ltype, const char* fmt, va_list ap) {
  const bool write_to_tombstone = (log->tfd != -1);
  const bool write_to_logcat = is_allowed_in_logcat(ltype)
                      && log->crashed_tid != -1
                      && log->current_tid != -1
                      && (log->crashed_tid == log->current_tid);
  static bool write_to_kmsg = should_write_to_kmsg();

  std::string msg;
  android::base::StringAppendV(&msg, fmt, ap);

  if (msg.empty()) return;

  if (write_to_tombstone) {
    TEMP_FAILURE_RETRY(write(log->tfd, msg.c_str(), msg.size()));
  }

  if (write_to_logcat) {
    __android_log_buf_write(LOG_ID_CRASH, ANDROID_LOG_FATAL, LOG_TAG, msg.c_str());
    if (log->amfd_data != nullptr) {
      *log->amfd_data += msg;
    }

    if (write_to_kmsg) {
      unique_fd kmsg_fd(open("/dev/kmsg_debug", O_WRONLY | O_APPEND | O_CLOEXEC));
      if (kmsg_fd.get() >= 0) {
        // The message may contain several lines; split it ourselves and write one record per
        // line. Drop a trailing newline first so that it does not yield an extra fragment.
        // It must be removed, not overwritten with '\0': a std::string keeps an embedded NUL as
        // a regular character, which would then be written to the kernel log.
        if (msg.back() == '\n') {
          msg.pop_back();
        }

        std::vector<std::string> fragments = android::base::Split(msg, "\n");
        for (const std::string& fragment : fragments) {
          // "<3>" is emitted as the leading tag of every record.
          static constexpr char prefix[] = "<3>DEBUG: ";
          struct iovec iov[3];
          iov[0].iov_base = const_cast<char*>(prefix);
          iov[0].iov_len = strlen(prefix);
          iov[1].iov_base = const_cast<char*>(fragment.c_str());
          iov[1].iov_len = fragment.length();
          iov[2].iov_base = const_cast<char*>("\n");
          iov[2].iov_len = 1;
          TEMP_FAILURE_RETRY(writev(kmsg_fd.get(), iov, 3));
        }
      }
    }
  }
}
dump_thread
dump线程信息:
- 打印pid、name 等信息
- 打印信号相关信息
- 尝试打印 abort 信息
- 打印寄存器信息
- 打印 backtrace
- 打印内存信息
// Dumps one thread into the tombstone log.
//
// For every thread: thread header, signal info (if siginfo is set), probable cause, registers
// and a backtrace. For the primary (crashing) thread additionally: GWP-ASan / Scudo crash
// information, the abort message, memory near registers, and the memory map.
//
// log->current_tid is set to this thread's tid for the duration of the dump and restored to
// log->crashed_tid before returning. Always returns true.
static bool dump_thread(log_t* log, unwindstack::Unwinder* unwinder, const ThreadInfo& thread_info,
const ProcessInfo& process_info, bool primary_thread) {
log->current_tid = thread_info.tid;
// Non-primary threads are preceded by a separator line.
if (!primary_thread) {
_LOG(log, logtype::THREAD, "--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---\n");
}
dump_thread_info(log, thread_info);
if (thread_info.siginfo) {
dump_signal_info(log, thread_info, process_info, unwinder->GetProcessMemory().get());
}
// The allocator crash-data helpers are only created for the primary thread; every later use
// of these pointers is guarded by primary_thread.
std::unique_ptr<GwpAsanCrashData> gwp_asan_crash_data;
std::unique_ptr<ScudoCrashData> scudo_crash_data;
if (primary_thread) {
gwp_asan_crash_data = std::make_unique<GwpAsanCrashData>(unwinder->GetProcessMemory().get(),
process_info, thread_info);
scudo_crash_data =
std::make_unique<ScudoCrashData>(unwinder->GetProcessMemory().get(), process_info);
}
// Cause reporting: GWP-ASan's cause wins when it claims the crash; otherwise the generic
// probable cause is printed, unless Scudo claims the crash (its cause is dumped further down).
if (primary_thread && gwp_asan_crash_data->CrashIsMine()) {
gwp_asan_crash_data->DumpCause(log);
} else if (thread_info.siginfo && !(primary_thread && scudo_crash_data->CrashIsMine())) {
dump_probable_cause(log, thread_info.siginfo, unwinder->GetMaps(), thread_info.registers.get());
}
if (primary_thread) {
dump_abort_message(log, unwinder->GetProcessMemory().get(), process_info.abort_msg_address);
}
dump_registers(log, thread_info.registers.get());
// Unwind from a clone of the registers rather than from thread_info.registers itself.
std::unique_ptr<unwindstack::Regs> regs_copy(thread_info.registers->Clone());
unwinder->SetRegs(regs_copy.get());
unwinder->Unwind();
if (unwinder->NumFrames() == 0) {
_LOG(log, logtype::THREAD, "Failed to unwind\n");
if (unwinder->LastErrorCode() != unwindstack::ERROR_NONE) {
_LOG(log, logtype::THREAD, " Error code: %s\n", unwinder->LastErrorCodeString());
_LOG(log, logtype::THREAD, " Error address: 0x%" PRIx64 "\n", unwinder->LastErrorAddress());
}
} else {
_LOG(log, logtype::BACKTRACE, "\nbacktrace:\n");
log_backtrace(log, unwinder, " ");
}
if (primary_thread) {
if (gwp_asan_crash_data->HasDeallocationTrace()) {
gwp_asan_crash_data->DumpDeallocationTrace(log, unwinder);
}
if (gwp_asan_crash_data->HasAllocationTrace()) {
gwp_asan_crash_data->DumpAllocationTrace(log, unwinder);
}
scudo_crash_data->DumpCause(log, unwinder);
unwindstack::Maps* maps = unwinder->GetMaps();
dump_memory_and_code(log, maps, unwinder->GetProcessMemory().get(),
thread_info.registers.get());
if (maps != nullptr) {
// Pass the untagged fault address to the map dump when one is known, 0 otherwise.
uint64_t addr = 0;
if (process_info.has_fault_address) {
addr = process_info.untagged_fault_address;
}
dump_all_maps(log, unwinder, addr);
}
}
// Restore the "current thread" so later output is attributed to the crashing thread again.
log->current_tid = log->crashed_tid;
return true;
}
dump_thread_info
线程相关信息,在tombstone可以看到
// Prints the per-thread header lines: pid/tid/thread name/process name, the uid and, when
// available (value other than -1), tagged_addr_ctrl.
// Also clears log->should_retrieve_logcat when the thread's uid is AID_LOGD.
static void dump_thread_info(log_t* log, const ThreadInfo& thread_info) {
  if (thread_info.uid == AID_LOGD) {
    log->should_retrieve_logcat = false;
  }

  // The process name is the first command-line element, if there is one.
  const char* process_name =
      thread_info.command_line.empty() ? "<unknown>" : thread_info.command_line[0].c_str();

  _LOG(log, logtype::HEADER, "pid: %d, tid: %d, name: %s >>> %s <<<\n", thread_info.pid,
       thread_info.tid, thread_info.thread_name.c_str(), process_name);
  _LOG(log, logtype::HEADER, "uid: %d\n", thread_info.uid);

  const bool has_tagged_addr_ctrl = (thread_info.tagged_addr_ctrl != -1);
  if (has_tagged_addr_ctrl) {
    _LOG(log, logtype::HEADER, "tagged_addr_ctrl: %016lx\n", thread_info.tagged_addr_ctrl);
  }
}
dump_signal_info
打印信号信息
// Prints the "signal N (NAME), code C (CODE[sender]), fault addr ADDR" header line.
//
// ADDR is "--------" when the process has no fault address. For SIGILL the 32-bit word read
// from the fault address is appended as "(*pc=...)"; it stays 0 if the read fills nothing in.
// The sender description is only filled in when signal_has_sender() says there is one.
static void dump_signal_info(log_t* log, const ThreadInfo& thread_info,
                             const ProcessInfo& process_info, unwindstack::Memory* process_memory) {
  char addr_desc[64];
  if (!process_info.has_fault_address) {
    snprintf(addr_desc, sizeof(addr_desc), "--------");
  } else {
    const size_t fault_addr = process_info.maybe_tagged_fault_address;
    if (thread_info.siginfo->si_signo != SIGILL) {
      snprintf(addr_desc, sizeof(addr_desc), "0x%zx", fault_addr);
    } else {
      uint32_t instruction = {};
      process_memory->Read(fault_addr, &instruction, sizeof(instruction));
      snprintf(addr_desc, sizeof(addr_desc), "0x%zx (*pc=%#08x)", fault_addr, instruction);
    }
  }

  char sender_desc[32] = {};  // Stays an empty string when there is no sender.
  const bool has_sender = signal_has_sender(thread_info.siginfo, thread_info.pid);
  if (has_sender) {
    get_signal_sender(sender_desc, sizeof(sender_desc), thread_info.siginfo);
  }

  _LOG(log, logtype::HEADER, "signal %d (%s), code %d (%s%s), fault addr %s\n",
       thread_info.siginfo->si_signo, get_signame(thread_info.siginfo),
       thread_info.siginfo->si_code, get_sigcode(thread_info.siginfo), sender_desc, addr_desc);
}
get_signame 获取信号的描述
// Maps si->si_signo to a printable signal name; returns "?" for signals not in the table.
const char* get_signame(const siginfo_t* si) {
  static const struct {
    int signo;
    const char* name;
  } kSignalNames[] = {
      {SIGABRT, "SIGABRT"},
      {SIGBUS, "SIGBUS"},
      {SIGFPE, "SIGFPE"},
      {SIGILL, "SIGILL"},
      {SIGSEGV, "SIGSEGV"},
      {SIGSTKFLT, "SIGSTKFLT"},
      {SIGSTOP, "SIGSTOP"},
      {SIGSYS, "SIGSYS"},
      {SIGTRAP, "SIGTRAP"},
      {BIONIC_SIGNAL_DEBUGGER, "<debuggerd signal>"},
  };

  for (const auto& entry : kSignalNames) {
    if (entry.signo == si->si_signo) {
      return entry.name;
    }
  }
  return "?";
}
get_sigcode 获取信号具体错误描述
// Returns a printable name for si->si_code.
//
// The code is first interpreted in the context of the signal number (si->si_signo), because
// the signal-specific code sets below are looked up per signal. If no signal-specific name
// matches, the signal-independent SI_* codes are tried. Returns "?" when nothing matches.
//
// Each signal-specific table is followed by a static_assert comparing the corresponding NSIG*
// constant against the last code listed, so the build fails when that constant changes.
const char* get_sigcode(const siginfo_t* si) {
// Try the signal specific codes...
switch (si->si_signo) {
case SIGILL:
switch (si->si_code) {
case ILL_ILLOPC: return "ILL_ILLOPC";
case ILL_ILLOPN: return "ILL_ILLOPN";
case ILL_ILLADR: return "ILL_ILLADR";
case ILL_ILLTRP: return "ILL_ILLTRP";
case ILL_PRVOPC: return "ILL_PRVOPC";
case ILL_PRVREG: return "ILL_PRVREG";
case ILL_COPROC: return "ILL_COPROC";
case ILL_BADSTK: return "ILL_BADSTK";
case ILL_BADIADDR:
return "ILL_BADIADDR";
case __ILL_BREAK:
return "ILL_BREAK";
case __ILL_BNDMOD:
return "ILL_BNDMOD";
}
static_assert(NSIGILL == __ILL_BNDMOD, "missing ILL_* si_code");
break;
case SIGBUS:
switch (si->si_code) {
case BUS_ADRALN: return "BUS_ADRALN";
case BUS_ADRERR: return "BUS_ADRERR";
case BUS_OBJERR: return "BUS_OBJERR";
case BUS_MCEERR_AR: return "BUS_MCEERR_AR";
case BUS_MCEERR_AO: return "BUS_MCEERR_AO";
}
static_assert(NSIGBUS == BUS_MCEERR_AO, "missing BUS_* si_code");
break;
case SIGFPE:
switch (si->si_code) {
case FPE_INTDIV: return "FPE_INTDIV";
case FPE_INTOVF: return "FPE_INTOVF";
case FPE_FLTDIV: return "FPE_FLTDIV";
case FPE_FLTOVF: return "FPE_FLTOVF";
case FPE_FLTUND: return "FPE_FLTUND";
case FPE_FLTRES: return "FPE_FLTRES";
case FPE_FLTINV: return "FPE_FLTINV";
case FPE_FLTSUB: return "FPE_FLTSUB";
case __FPE_DECOVF:
return "FPE_DECOVF";
case __FPE_DECDIV:
return "FPE_DECDIV";
case __FPE_DECERR:
return "FPE_DECERR";
case __FPE_INVASC:
return "FPE_INVASC";
case __FPE_INVDEC:
return "FPE_INVDEC";
case FPE_FLTUNK:
return "FPE_FLTUNK";
case FPE_CONDTRAP:
return "FPE_CONDTRAP";
}
static_assert(NSIGFPE == FPE_CONDTRAP, "missing FPE_* si_code");
break;
case SIGSEGV:
switch (si->si_code) {
case SEGV_MAPERR: return "SEGV_MAPERR";
case SEGV_ACCERR: return "SEGV_ACCERR";
case SEGV_BNDERR: return "SEGV_BNDERR";
case SEGV_PKUERR: return "SEGV_PKUERR";
case SEGV_ACCADI:
return "SEGV_ACCADI";
case SEGV_ADIDERR:
return "SEGV_ADIDERR";
case SEGV_ADIPERR:
return "SEGV_ADIPERR";
case SEGV_MTEAERR:
return "SEGV_MTEAERR";
case SEGV_MTESERR:
return "SEGV_MTESERR";
}
static_assert(NSIGSEGV == SEGV_MTESERR, "missing SEGV_* si_code");
break;
case SIGSYS:
switch (si->si_code) {
case SYS_SECCOMP: return "SYS_SECCOMP";
case SYS_USER_DISPATCH:
return "SYS_USER_DISPATCH";
}
static_assert(NSIGSYS == SYS_USER_DISPATCH, "missing SYS_* si_code");
break;
case SIGTRAP:
switch (si->si_code) {
case TRAP_BRKPT: return "TRAP_BRKPT";
case TRAP_TRACE: return "TRAP_TRACE";
case TRAP_BRANCH: return "TRAP_BRANCH";
case TRAP_HWBKPT: return "TRAP_HWBKPT";
case TRAP_UNK:
return "TRAP_UNDIAGNOSED";
}
// Otherwise, when the low byte of the code is SIGTRAP, the next byte is matched against the
// PTRACE_EVENT_* values.
if ((si->si_code & 0xff) == SIGTRAP) {
switch ((si->si_code >> 8) & 0xff) {
case PTRACE_EVENT_FORK:
return "PTRACE_EVENT_FORK";
case PTRACE_EVENT_VFORK:
return "PTRACE_EVENT_VFORK";
case PTRACE_EVENT_CLONE:
return "PTRACE_EVENT_CLONE";
case PTRACE_EVENT_EXEC:
return "PTRACE_EVENT_EXEC";
case PTRACE_EVENT_VFORK_DONE:
return "PTRACE_EVENT_VFORK_DONE";
case PTRACE_EVENT_EXIT:
return "PTRACE_EVENT_EXIT";
case PTRACE_EVENT_SECCOMP:
return "PTRACE_EVENT_SECCOMP";
case PTRACE_EVENT_STOP:
return "PTRACE_EVENT_STOP";
}
}
static_assert(NSIGTRAP == TRAP_UNK, "missing TRAP_* si_code");
break;
}
// Then the other codes...
switch (si->si_code) {
case SI_USER: return "SI_USER";
case SI_KERNEL: return "SI_KERNEL";
case SI_QUEUE: return "SI_QUEUE";
case SI_TIMER: return "SI_TIMER";
case SI_MESGQ: return "SI_MESGQ";
case SI_ASYNCIO: return "SI_ASYNCIO";
case SI_SIGIO: return "SI_SIGIO";
case SI_TKILL: return "SI_TKILL";
case SI_DETHREAD: return "SI_DETHREAD";
}
// Then give up...
return "?";
}
log_backtrace
// Logs every frame currently held by `unwinder`, one per line, each preceded by `prefix`.
// Build IDs are enabled on the unwinder before formatting. When the unwinder reports that an
// ELF had to be read from memory instead of from its file, an explanatory NOTE is logged first.
void log_backtrace(log_t* log, unwindstack::Unwinder* unwinder, const char* prefix) {
  const bool elf_read_from_memory = unwinder->elf_from_memory_not_file();
  if (elf_read_from_memory) {
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: Function names and BuildId information is missing for some frames due\n", prefix);
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: to unreadable libraries. For unwinds of apps, only shared libraries\n", prefix);
    _LOG(log, logtype::BACKTRACE, "%sNOTE: found under the lib/ directory are readable.\n", prefix);
#if defined(ROOT_POSSIBLE)
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: On this device, run setenforce 0 to make the libraries readable.\n", prefix);
#endif
  }

  unwinder->SetDisplayBuildID(true);

  size_t frame_index = 0;
  while (frame_index < unwinder->NumFrames()) {
    const std::string frame_text = unwinder->FormatFrame(frame_index);
    _LOG(log, logtype::BACKTRACE, "%s%s\n", prefix, frame_text.c_str());
    ++frame_index;
  }
}
Unwinder::FormatFrame
格式化输出一个 frame
// Renders one stack frame as a single line of text:
//   #NN pc <rel_pc> <map name | <unknown> | <anonymous:start>> [(offset 0x..)]
//   [(function[+offset])] [(BuildId: ..)]
// rel_pc is printed with 8 hex digits on 32-bit architectures and 16 otherwise. The function
// name is demangled when __cxa_demangle succeeds, and printed as-is when it does not.
std::string Unwinder::FormatFrame(const FrameData& frame) const {
  std::string line;
  if (ArchIs32Bit(arch_)) {
    line = android::base::StringPrintf(" #%02zu pc %08" PRIx64, frame.num, frame.rel_pc);
  } else {
    line = android::base::StringPrintf(" #%02zu pc %016" PRIx64, frame.num, frame.rel_pc);
  }

  // Which mapping the pc falls into.
  const bool has_map = (frame.map_start != frame.map_end);
  if (!has_map) {
    line += " <unknown>";
  } else if (frame.map_name.empty()) {
    line += android::base::StringPrintf(" <anonymous:%" PRIx64 ">", frame.map_start);
  } else {
    line += " ";
    line += frame.map_name;
  }

  if (frame.map_elf_start_offset != 0) {
    line += android::base::StringPrintf(" (offset 0x%" PRIx64 ")", frame.map_elf_start_offset);
  }

  if (!frame.function_name.empty()) {
    line += " (";
    if (char* demangled = __cxa_demangle(frame.function_name.c_str(), nullptr, nullptr, nullptr);
        demangled != nullptr) {
      line += demangled;
      free(demangled);  // __cxa_demangle allocated the buffer for us.
    } else {
      line += frame.function_name;
    }
    if (frame.function_offset != 0) {
      line += android::base::StringPrintf("+%" PRId64, frame.function_offset);
    }
    line += ')';
  }

  MapInfo* map_info = maps_->Find(frame.map_start);
  if (map_info != nullptr && display_build_id_) {
    const std::string build_id = map_info->GetPrintableBuildID();
    if (!build_id.empty()) {
      line += " (BuildId: " + build_id + ')';
    }
  }
  return line;
}
dump_memory_and_code
打印内存信息
// For every register reported by `regs`, dumps the memory around the register's value under
// the label "memory near <reg>". When `maps` is available and the (untagged) value falls in a
// named mapping, the mapping name is appended to the label in parentheses.
void dump_memory_and_code(log_t* log, unwindstack::Maps* maps, unwindstack::Memory* memory,
                          unwindstack::Regs* regs) {
  auto dump_near_register = [log, maps, memory](const char* reg_name, uint64_t reg_value) {
    std::string label("memory near ");
    label += reg_name;

    if (maps != nullptr) {
      unwindstack::MapInfo* map_info = maps->Find(untag_address(reg_value));
      const bool in_named_map = (map_info != nullptr) && !map_info->name().empty();
      if (in_named_map) {
        label += " (" + map_info->name() + ")";
      }
    }

    dump_memory(log, memory, reg_value, label);
  };

  regs->IterateRegisters(dump_near_register);
}
dump_memory 打印内存信息
// Logs a hex + ASCII dump of target memory around `addr` under the heading `label`.
//
// The raw bytes and one tag byte per tag granule are fetched by the buffer-filling dump_memory
// overload (not shown here); nothing is logged when it returns -1.
// NOTE(review): `addr` is passed by pointer to that overload — presumably so it can move the
// start address; confirm against its implementation.
void dump_memory(log_t* log, unwindstack::Memory* memory, uint64_t addr, const std::string& label) {
// Fixed-size buffers: MEMORY_BYTES_TO_DUMP bytes of data, one tag per kTagGranuleSize bytes.
uintptr_t data[MEMORY_BYTES_TO_DUMP / sizeof(uintptr_t)];
uint8_t tags[MEMORY_BYTES_TO_DUMP / kTagGranuleSize];
ssize_t bytes = dump_memory(data, sizeof(data), tags, sizeof(tags), &addr, memory);
if (bytes == -1) {
return;
}
_LOG(log, logtype::MEMORY, "\n%s:\n", label.c_str());
uintptr_t* data_ptr = data;
uint8_t* tags_ptr = tags;
// One output line per MEMORY_BYTES_PER_LINE bytes; one tag is consumed per line.
for (size_t line = 0; line < static_cast<size_t>(bytes) / MEMORY_BYTES_PER_LINE; line++) {
// The tag is placed in the top byte (bits 56..63) of the printed address.
uint64_t tagged_addr = addr | static_cast<uint64_t>(*tags_ptr++) << 56;
std::string logline;
android::base::StringAppendF(&logline, " %" PRIPTR, tagged_addr);
addr += MEMORY_BYTES_PER_LINE;
std::string ascii;
for (size_t i = 0; i < MEMORY_BYTES_PER_LINE / sizeof(uintptr_t); i++) {
android::base::StringAppendF(&logline, " %" PRIPTR, static_cast<uint64_t>(*data_ptr));
// Build the ASCII column from the word's bytes in memory order: bytes in [0x20, 0x7f) are
// shown as-is, everything else as '.'.
uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr);
for (size_t val = 0; val < sizeof(uintptr_t); val++, ptr++) {
if (*ptr >= 0x20 && *ptr < 0x7f) {
ascii += *ptr;
} else {
ascii += '.';
}
}
data_ptr++;
}
_LOG(log, logtype::MEMORY, "%s %s\n", logline.c_str(), ascii.c_str());
}
}
dump 完成通知
// Excerpt: only the tail of main() is shown; the earlier part is elided ("...").
int main(int argc, char** argv) {
...
close(STDOUT_FILENO);
// If a connection to tombstoned was established, send it the completion notification;
// failure to do so is only logged.
if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
LOG(ERROR) << "failed to notify tombstoned of completion";
}
}
tombstoned_notify_completion
bool tombstoned_notify_completion(int tombstoned_socket) {
TombstonedCrashPacket packet = {};
packet.packet_type = CrashPacketType::kCompletedDump;
if (TEMP_FAILURE_RETRY(write(tombstoned_socket, &packet, sizeof(packet))) != sizeof(packet)) {
return false;
}
return true;
}
从前面的 perform_request 流程可知,tombstoned 在 crash socket 上注册了一个带超时的读事件监听(EV_READ | EV_TIMEOUT);当 crash_dump 发来完成通知或等待超时时,会回调 crash_completed_cb
crash_completed_cb
// Event callback for the crash socket. `arg` is the raw Crash pointer; ownership is taken back
// here so that the Crash is destroyed when this callback is done with it.
//
// The queue is told that the crash finished regardless of the event kind; the completion
// packet is only processed when EV_READ is set in `ev`. Afterwards pending crashes, if any,
// are given a chance to start.
static void crash_completed_cb(evutil_socket_t sockfd, short ev, void* arg) {
  std::unique_ptr<Crash> owned_crash(static_cast<Crash*>(arg));
  CrashQueue* const crash_queue = CrashQueue::for_crash(owned_crash);

  crash_queue->on_crash_completed();

  const bool socket_readable = ((ev & EV_READ) == EV_READ);
  if (socket_readable) {
    crash_completed(sockfd, std::move(owned_crash));
  }

  crash_queue->maybe_dequeue_crashes(perform_request);
}
crash_completed
tombstoned 处理完成的逻辑主要在这个函数
// Handles the completion packet sent once a dump has finished.
//
// Reads one TombstonedCrashPacket from `sockfd` and, if it is a well-formed kCompletedDump
// packet and the crash has a text output fd, links the (so far unnamed or temporary) output
// files to their final names in the queue directory. Named temporary files, which only exist
// when create_temporary_file() had to fall back from O_TMPFILE, are unlinked afterwards.
//
// NOTE(review): every early return below skips the temporary-file cleanup at the end, so a
// named temporary file stays in the directory in those cases.
static void crash_completed(borrowed_fd sockfd, std::unique_ptr<Crash> crash) {
  TombstonedCrashPacket request = {};
  CrashQueue* queue = CrashQueue::for_crash(crash);

  ssize_t rc = TEMP_FAILURE_RETRY(read(sockfd.get(), &request, sizeof(request)));
  if (rc == -1) {
    PLOG(WARNING) << "failed to read from crash socket";
    return;
  } else if (rc != sizeof(request)) {
    LOG(WARNING) << "crash socket received short read of length " << rc << " (expected "
                 << sizeof(request) << ")";
    return;
  }

  if (request.packet_type != CrashPacketType::kCompletedDump) {
    LOG(WARNING) << "unexpected crash packet type, expected kCompletedDump, received "
                 << uint32_t(request.packet_type);
    return;
  }

  if (crash->output.text.fd == -1) {
    LOG(WARNING) << "missing output fd";
    return;
  }

  CrashArtifactPaths paths = queue->get_next_artifact_paths();

  // Give the text output its final name. (Logged at ERROR level, as in the original.)
  if (rename_tombstone_fd(crash->output.text.fd, queue->dir_fd(), paths.text)) {
    if (crash->crash_type == kDebuggerdJavaBacktrace) {
      LOG(ERROR) << "Traces for pid " << crash->crash_pid << " written to: " << paths.text;
    } else {
      // NOTE: Several tools parse this log message to figure out where the
      // tombstone associated with a given native crash was written. Any changes
      // to this message must be carefully considered.
      LOG(ERROR) << "Tombstone written to: " << paths.text;
    }
  }

  // Same for the proto output, if this crash has one.
  if (crash->output.proto && crash->output.proto->fd != -1) {
    if (!paths.proto) {
      LOG(ERROR) << "missing path for proto tombstone";
    } else {
      rename_tombstone_fd(crash->output.proto->fd, queue->dir_fd(), *paths.proto);
    }
  }

  // Named temporary files have to be removed explicitly.
  if (crash->output.text.temporary_path) {
    rc = unlinkat(queue->dir_fd().get(), crash->output.text.temporary_path->c_str(), 0);
    if (rc != 0) {
      // Report the path that actually failed to unlink (the temporary file), not the final
      // tombstone name.
      PLOG(ERROR) << "failed to unlink temporary tombstone at "
                  << *crash->output.text.temporary_path;
    }
  }
  if (crash->output.proto && crash->output.proto->temporary_path) {
    rc = unlinkat(queue->dir_fd().get(), crash->output.proto->temporary_path->c_str(), 0);
    if (rc != 0) {
      PLOG(ERROR) << "failed to unlink temporary proto tombstone";
    }
  }
}
CrashQueue::get_next_artifact_paths
返回的文件名形如 tombstone_XX(XX 为两位序号,到达最大数量后从 00 重新循环);若队列支持 proto,还会同时返回 tombstone_XX.pb
// Produces the file name(s) for the next artifact slot and advances the slot counter.
//
// The text name is "<prefix><NN>" with a two-digit, zero-padded slot number; when the queue
// supports proto output, "<prefix><NN>.pb" is returned as well. The slot counter wraps around
// at max_artifacts_, so old artifacts get overwritten in round-robin order.
CrashArtifactPaths get_next_artifact_paths() {
  const char* const prefix = file_name_prefix_.c_str();
  const auto slot = next_artifact_;

  CrashArtifactPaths paths;
  paths.text = StringPrintf("%s%02d", prefix, slot);
  if (supports_proto_) {
    paths.proto = StringPrintf("%s%02d.pb", prefix, slot);
  }

  next_artifact_ = (slot + 1) % max_artifacts_;
  return paths;
}
对于 tombstone的定义如下
// The CrashQueue used for tombstones.
// NOTE(review): the CrashQueue constructor is not shown here. Judging by the members used in
// get_next_artifact_paths, the arguments are presumably: directory, file name prefix, maximum
// number of artifacts (system property, default 32), then two further settings (1 and true,
// the latter presumably "supports proto") — confirm against the constructor.
static CrashQueue queue("/data/tombstones", "tombstone_" ,
GetIntProperty("tombstoned.max_tombstone_count", 32),
1 , true );
简单看下 rename_tombstone_fd 实现
// Gives the file behind `fd` the name `path` inside the directory `dirfd`.
//
// Any existing entry called `path` is unlinked first (a missing entry is fine). The open file
// is then linked into place through its /proc/self/fd/<n> symlink, following the symlink.
// Returns false, after logging, if either step fails.
static bool rename_tombstone_fd(borrowed_fd fd, borrowed_fd dirfd, const std::string& path) {
  const bool unlink_failed = (unlinkat(dirfd.get(), path.c_str(), 0) != 0);
  if (unlink_failed && errno != ENOENT) {
    PLOG(ERROR) << "failed to unlink tombstone at " << path;
    return false;
  }

  const std::string proc_fd_path = StringPrintf("/proc/self/fd/%d", fd.get());
  if (linkat(AT_FDCWD, proc_fd_path.c_str(), dirfd.get(), path.c_str(), AT_SYMLINK_FOLLOW) != 0) {
    PLOG(ERROR) << "failed to link tombstone at " << path;
    return false;
  }

  return true;
}
下面看看AMS对crash_dump上报的crash处理逻辑。
系统监听 Native crash
SystemServer#startOtherServices
private void startOtherServices(@NonNull TimingsTraceAndSlog t) {
...
mActivityManagerService.systemReady(() -> {
Slog.i(TAG, "Making services ready");
t.traceBegin("StartActivityManagerReadyPhase");
mSystemServiceManager.startBootPhase(t, SystemService.PHASE_ACTIVITY_MANAGER_READY);
t.traceEnd();
t.traceBegin("StartObservingNativeCrashes");
try {
mActivityManagerService.startObservingNativeCrashes();
} catch (Throwable e) {
}
AMS#startObservingNativeCrashes
/**
 * Creates a {@link NativeCrashListener} bound to this service and starts it.
 */
public void startObservingNativeCrashes() {
    new NativeCrashListener(this).start();
}
NativeCrashListener#run
run 方法是其实现的地方,建立了一个 socket server 端,以供 crash_dump 来连接并反馈
// Listener loop: creates a UNIX stream socket at DEBUGGERD_SOCKET_PATH, then accepts
// connections forever, handing each one to consumeNativeCrashData(). Every accepted connection
// is answered with a single ack byte and closed, whether or not handling succeeded.
// If the socket cannot be set up, the error is logged and the method returns.
@Override
public void run() {
// One-byte buffer (value 0) written back to the peer as the acknowledgement.
final byte[] ackSignal = new byte[1];
if (DEBUG) Slog.i(TAG, "Starting up");
// Remove a stale socket file before binding.
{
File socketFile = new File(DEBUGGERD_SOCKET_PATH);
if (socketFile.exists()) {
socketFile.delete();
}
}
try {
FileDescriptor serverFd = Os.socket(AF_UNIX, SOCK_STREAM, 0);
final UnixSocketAddress sockAddr = UnixSocketAddress.createFileSystem(
DEBUGGERD_SOCKET_PATH);
Os.bind(serverFd, sockAddr);
Os.listen(serverFd, 1);
// Make the socket file accessible to all users.
Os.chmod(DEBUGGERD_SOCKET_PATH, 0777);
while (true) {
FileDescriptor peerFd = null;
try {
if (MORE_DEBUG) Slog.v(TAG, "Waiting for debuggerd connection");
peerFd = Os.accept(serverFd, null );
if (MORE_DEBUG) Slog.v(TAG, "Got debuggerd socket " + peerFd);
if (peerFd != null) {
consumeNativeCrashData(peerFd);
}
} catch (Exception e) {
// A failure on one connection must not stop the accept loop.
Slog.w(TAG, "Error handling connection", e);
} finally {
// Always ack and close the connection; errors while doing so are only logged in
// MORE_DEBUG builds.
if (peerFd != null) {
try {
Os.write(peerFd, ackSignal, 0, 1);
} catch (Exception e) {
if (MORE_DEBUG) {
Slog.d(TAG, "Exception writing ack: " + e.getMessage());
}
}
try {
Os.close(peerFd);
} catch (ErrnoException e) {
if (MORE_DEBUG) {
Slog.d(TAG, "Exception closing socket: " + e.getMessage());
}
}
}
}
}
} catch (Exception e) {
Slog.e(TAG, "Unable to init native debug socket!", e);
}
}
NativeCrashListener#consumeNativeCrashData
接收、解析crash数据,并向AMS上报crash
// Reads one crash report from the connected socket and hands it to a NativeCrashReporter.
//
// Wire format as consumed here: an 8-byte header (int pid at offset 0, int signal at offset 4,
// decoded with unpackInt), followed by the report text. The text is read until a chunk ends in
// a 0 byte (which is dropped) or until read() returns no more data.
// Reports are dropped when the header cannot be read, the pid is not positive, no ProcessRecord
// exists for the pid, or the process is persistent. All exceptions are caught and logged.
void consumeNativeCrashData(FileDescriptor fd) {
if (MORE_DEBUG) Slog.i(TAG, "debuggerd connected");
final byte[] buf = new byte[4096];
final ByteArrayOutputStream os = new ByteArrayOutputStream(4096);
try {
// Apply the same timeout to receives and sends on this connection.
StructTimeval timeout = StructTimeval.fromMillis(SOCKET_TIMEOUT_MILLIS);
Os.setsockoptTimeval(fd, SOL_SOCKET, SO_RCVTIMEO, timeout);
Os.setsockoptTimeval(fd, SOL_SOCKET, SO_SNDTIMEO, timeout);
// Header: pid and signal number, 4 bytes each.
int headerBytes = readExactly(fd, buf, 0, 8);
if (headerBytes != 8) {
Slog.e(TAG, "Unable to read from debuggerd");
return;
}
int pid = unpackInt(buf, 0);
int signal = unpackInt(buf, 4);
if (DEBUG) {
Slog.v(TAG, "Read pid=" + pid + " signal=" + signal);
}
if (pid > 0) {
// Look up the process this pid belongs to.
final ProcessRecord pr;
synchronized (mAm.mPidsSelfLocked) {
pr = mAm.mPidsSelfLocked.get(pid);
}
if (pr != null) {
// Crashes of persistent processes are not reported through this path.
if (pr.isPersistent()) {
if (DEBUG) {
Slog.v(TAG, "Skipping report for persistent app " + pr);
}
return;
}
// Accumulate the report body in `os`.
int bytes;
do {
bytes = Os.read(fd, buf, 0, buf.length);
if (bytes > 0) {
if (MORE_DEBUG) {
String s = new String(buf, 0, bytes, "UTF-8");
Slog.v(TAG, "READ=" + bytes + "> " + s);
}
// A chunk ending in a 0 byte marks the end of the report; the 0 byte itself is not kept.
if (buf[bytes-1] == 0) {
os.write(buf, 0, bytes-1);
break;
}
os.write(buf, 0, bytes);
}
} while (bytes > 0);
if (DEBUG) Slog.v(TAG, "processing");
// Flag the process as crashing before the report is delivered.
synchronized (mAm) {
synchronized (mAm.mProcLock) {
pr.mErrorState.setCrashing(true);
pr.mErrorState.setForceCrashReport(true);
}
}
// Deliver the report on a separate thread.
final String reportString = new String(os.toByteArray(), "UTF-8");
(new NativeCrashReporter(pr, signal, reportString)).start();
} else {
Slog.w(TAG, "Couldn't find ProcessRecord for pid " + pid);
}
} else {
Slog.e(TAG, "Bogus pid!");
}
} catch (Exception e) {
Slog.e(TAG, "Exception dealing with report", e);
}
}
NativeCrashReporter#run
NativeCrashReporter是继承Thread类,用来异步上报crash事件
/**
 * Packages the native crash as a {@code CrashInfo} and reports it through
 * {@code handleApplicationCrashInner} with the event type "native_crash".
 * The signal's description becomes the exception message and the crash report text the
 * stack trace; the throw location fields are all "unknown". Any exception raised while
 * reporting is logged and swallowed.
 */
@Override
public void run() {
    try {
        final CrashInfo crashInfo = new CrashInfo();
        crashInfo.exceptionClassName = "Native crash";
        crashInfo.exceptionMessage = Os.strsignal(mSignal);
        crashInfo.throwFileName = "unknown";
        crashInfo.throwClassName = "unknown";
        crashInfo.throwMethodName = "unknown";
        crashInfo.stackTrace = mCrashReport;

        if (DEBUG) {
            Slog.v(TAG, "Calling handleApplicationCrash()");
        }
        mAm.handleApplicationCrashInner("native_crash", mApp, mApp.processName, crashInfo);
        if (DEBUG) {
            Slog.v(TAG, "<-- handleApplicationCrash() returned");
        }
    } catch (Exception e) {
        Slog.e(TAG, "Unable to report native crash", e);
    }
}
参考:
- https://zhuanlan.zhihu.com/p/372073135
- http://gityuan.com/2016/06/25/android-native-crash/