Xcrash是怎么捕获Crash的
一、Xcrash简介
Xcrash是爱奇艺在2019年4月开源在GitHub上的稳定性日志收集框架,它能为android收集java crash、native crash、anr日志。不需要root权限和系统权限。支持 Android 4.0 - 10(API level 14 - 29),支持 armeabi,armeabi-v7a,arm64-v8a,x86 和 x86_64。
项目地址:github.com/iqiyi/xCrash
二、Xcrash架构
三、Xcrash类图
Xcrash作为门面模式的入口,client通过配置InitParameters来进行初始化。Xcrash分别关联三种类型的Handler来处理对应的崩溃监听和日志收集,通过FileManager和TombstoneManager对崩溃日志进行tombstone文件管理。client调用TombstoneParser来解析本地生成的对应tombstone文件,获取数据。
四、捕获Java崩溃
Java层的崩溃可以直接交给JVM的崩溃捕获机制去处理。这个非常简单,不赘述。
1Thread.setDefaultUncaughtExceptionHandler(this);
2复制代码
如果有java crash发生,会回调uncaughtException,执行handleException收集相关log信息
1private void handleException(Thread thread, Throwable throwable) {
2...
3 //notify the java crash
4 NativeHandler.getInstance().notifyJavaCrashed();
5 AnrHandler.getInstance().notifyJavaCrashed();
6 //create log file data/data/packageName/files/tombstones
7 logFile = FileManager.getInstance().createLogFile(logPath);
8 ...
9 //write info to log file
10 if (logFile != null) {
11…
12 // write java stacktrace
13 raf.write(emergency.getBytes("UTF-8"));
14
15 //write logcat日志 logcat -b main;logcat -b system; logcat -b event;
16 raf.write(Util.getLogcat(logcatMainLines, logcatSystemLines, logcatEventsLines).getBytes("UTF-8"));
17
18 //write fds
19 raf.write(Util.getFds().getBytes("UTF-8"));
20
21 //write network info
22 raf.write(Util.getNetworkInfo().getBytes("UTF-8"));
23
24 //write memory info
25 raf.write(Util.getMemoryInfo().getBytes("UTF-8"));
26
27 //write background / foreground
28 raf.write(("foreground:\n" + (ActivityMonitor.getInstance().isApplicationForeground() ? "yes" : "no") + "\n\n").getBytes("UTF-8"));
29
30 //write other threads info
31 if (dumpAllThreads) {
32 raf.write(getOtherThreadsInfo(thread).getBytes("UTF-8"));
33 }
34 }
35
36 //callback 回调ICrashCallback onCrash
37 if (callback != null) {
38 try {
39 callback.onCrash(logFile == null ? null : logFile.getAbsolutePath(), emergency);
40 } catch (Exception ignored) {
41 }
42 }
43}
五、捕获Native崩溃
Crash.java
1public static synchronized int init(Context ctx, InitParameters params) {
2…
3 NativeHandler.getInstance().initialize(...)
4...
5}
6复制代码
NativeHandler.java
1int initialize(...) {
2 //load lib
3 System.loadLibrary("xcrash");
4 ...
5 //init native lib
6 try {
7 int r = nativeInit(...);
8 }
9...
10}
11复制代码
NativeHandler在Xcrash init时会执行initialize方法进行初始化,初始化过程首先通过System.loadLibrary("xcrash")加载native库并注册native函数,其次就是调用nativeInit。
执行System.loadLibrary("xcrash")时,JNI_OnLoad会被回调,这里采用的是动态注册的方式。
xc_jni.c
1JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved)
2{
3...
4 if((*env)->RegisterNatives(env, cls, xc_jni_methods, sizeof(xc_jni_methods) / sizeof(xc_jni_methods[0]))) return -1;
5...
6 return XC_JNI_VERSION;
7}
8复制代码
xc_jni_methods数组的第0个元素对应nativeInit:
1static JNINativeMethod xc_jni_methods[] = {
2 {
3 "nativeInit",
4 "("
5 "I"
6 "Ljava/lang/String;"
7 "Ljava/lang/String;"
8 "Ljava/lang/String;"
9 "Ljava/lang/String;"
10 "Ljava/lang/String;"
11 "Ljava/lang/String;"
12 "Ljava/lang/String;"
13 "Ljava/lang/String;"
14 "Ljava/lang/String;"
15 "Ljava/lang/String;"
16 "Z"
17 "Z"
18 "I"
19 "I"
20 "I"
21 "Z"
22 "Z"
23 "Z"
24 "Z"
25 "Z"
26 "I"
27 "[Ljava/lang/String;"
28 "Z"
29 "Z"
30 "I"
31 "I"
32 "I"
33 "Z"
34 "Z"
35 ")"
36 "I",
37 (void *)xc_jni_init
38 },
39…
40}
41复制代码
java层调用nativeInit,native xc_jni_init会被调用。接着看nativeInit逻辑 xc_jni.c
1static jint xc_jni_init(...)
2{
3...
4 //common init
5 xc_common_init(…);//通用信息初始化,包括系统信息、应用信息、进程信息等。
6...
7 //crash init 捕获crash日志
8 r_crash = xc_crash_init(…);
9...
10 //trace init 捕获anr日志
11 r_trace = xc_trace_init(...);
12 }
13...
14 return (0 == r_crash && 0 == r_trace) ? 0 : XCC_ERRNO_JNI;
15}
16复制代码
先看xc_crash_init
1int xc_crash_init(){
2 …
3 //init for JNI callback
4 xc_crash_init_callback(env);//1设置信号native 信号回调 jni到java
5 …
6 //register signal handler
7 return xcc_signal_crash_register(xc_crash_signal_handler);//2注册信号handler,能回调处理对应的信号
8}
9复制代码
1)设置callback:
xc_crash_init_callback最终回调的是NativeHandler的crashCallback
1private static void crashCallback(String logPath, String emergency, boolean dumpJavaStacktrace, boolean isMainThread, String threadName) {
2 if (!TextUtils.isEmpty(logPath)) {
3 //append java stacktrace
4 TombstoneManager.appendSection(logPath, "java stacktrace", stacktrace);
5 ...
6 //append memory info
7 TombstoneManager.appendSection(logPath, "memory info", Util.getProcessMemoryInfo());
8 //append background / foreground
9 TombstoneManager.appendSection(logPath, "foreground", ActivityMonitor.getInstance().isApplicationForeground() ? "yes" : "no");
10 }
11
12 //最后回调到client注册的ICrashCallback.onCrash
13 ICrashCallback callback = NativeHandler.getInstance().crashCallback;
14 if (callback != null) {
15 callback.onCrash(logPath, emergency);
16 }
17...
18}
19复制代码
2)信号注册:
1static xcc_signal_crash_info_t xcc_signal_crash_info[] =
2{
3 {.signum = SIGABRT},//调用abort函数生成的信号,表示程序异常
4 {.signum = SIGBUS},// 非法地址,包括内存地址对齐出错
5 {.signum = SIGFPE},// 计算错误,比如除0、溢出
6 {.signum = SIGILL},// 强制结束程序
7 {.signum = SIGSEGV},// 非法内存操作
8 {.signum = SIGTRAP},// 断点时产生,由debugger使用
9 {.signum = SIGSYS},// 非法的系统调用
10 {.signum = SIGSTKFLT}// 协处理器堆栈错误
11};
12
13int xcc_signal_crash_register(void (*handler)(int, siginfo_t *, void *))
14{
15 stack_t ss;
16 if(NULL == (ss.ss_sp = calloc(1, XCC_SIGNAL_CRASH_STACK_SIZE))) return XCC_ERRNO_NOMEM;
17 ss.ss_size = XCC_SIGNAL_CRASH_STACK_SIZE;
18 ss.ss_flags = 0;
19 if(0 != sigaltstack(&ss, NULL)) return XCC_ERRNO_SYS;
20 struct sigaction act;
21 memset(&act, 0, sizeof(act));
22 sigfillset(&act.sa_mask);
23 act.sa_sigaction = handler;//设置信号回调handler
24 act.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK;
25 size_t i;
26 //通过sigaction注册上述信号组
27 for(i = 0; i < sizeof(xcc_signal_crash_info) / sizeof(xcc_signal_crash_info[0]); i++)
28 if(0 != sigaction(xcc_signal_crash_info[i].signum, &act, &(xcc_signal_crash_info[i].oldact)))
29 return XCC_ERRNO_SYS;
30 return 0;
31}
32复制代码
注册的是函数指针:xc_crash_signal_handler,追过去看看:
1static void xc_crash_signal_handler(int sig, siginfo_t *si, void *uc
2{
3 ...
4 //create and open log file 打开log文件
5 if((xc_crash_log_fd = xc_common_open_crash_log(xc_crash_log_pathname, sizeof(xc_crash_log_pathname), &xc_crash_log_from_placeholder)) < 0) goto end;
6...
7 //spawn crash dumper process 起一个进程来处理dump
8 pid_t dumper_pid = xc_crash_fork(xc_crash_exec_dumper);
9...
10 //JNI callback 完成之后jni到java的callback回调
11 xc_crash_callback();
12...
13}
14复制代码
进入xc_crash_exec_dumper这个函数指针指向的函数,看看进程dump操作:
1static int xc_crash_exec_dumper(void *arg)
2{
3 …
4 //这里执行的是#define XCC_UTIL_XCRASH_DUMPER_FILENAME "libxcrash_dumper.so"
5 execl(xc_crash_dumper_pathname, XCC_UTIL_XCRASH_DUMPER_FILENAME, NULL);
6 return 100 + errno;
7}
8复制代码
这个部分是做各种数据的dump。简单找下main方法:
xcd_core.c
1int main(int argc, char** argv)
2{
3...
4 //read args from stdin
5 if(0 != xcd_core_read_args()) exit(1);
6 //open log file
7 if(0 > (xcd_core_log_fd = XCC_UTIL_TEMP_FAILURE_RETRY(open(xcd_core_log_pathname, O_WRONLY | O_CLOEXEC)))) exit(2);
8 //register signal handler for catching self-crashing
9 xcc_unwind_init(xcd_core_spot.api_level);
10 xcc_signal_crash_register(xcd_core_signal_handler);
11 //create process object
12 if(0 != xcd_process_create())) exit(3);
13 //suspend all threads in the process
14 xcd_process_suspend_threads(xcd_core_proc);
15 //load process info
16 if(0 != xcd_process_load_info(xcd_core_proc)) exit(4);
17 //record system info
18 if(0 != xcd_sys_record(...)) exit(5);
19 //record process info
20 if(0 != xcd_process_record(...)) exit(6);
21 //resume all threads in the process
22 xcd_process_resume_threads(xcd_core_proc);
23...
24}
25复制代码
不细看了,整个过程先是挂起crash进程的所有线程,然后收集相关log,最后resume所有线程。
xc_trace_init部分不分析了,与xc_jni_init分析方法一致。这里也就简单分析了个大脉络。
Native崩溃处理步骤总结:
注册信号处理函数(signal handler)。
崩溃发生时创建子进程收集信息(避免在崩溃进程调用函数的系统限制)。
suspend崩溃进程中所有的线程,暂停logcat输出,收集logcat。
收集backtrace等信息。
收集内存数据。
完成后恢复线程。
六、捕获ANR
同样在Xcrash init时初始化
Crash.java
1public static synchronized int init(Context ctx, InitParameters params) {
2//init ANR handler (API level < 21)
3if (params.enableAnrHandler && Build.VERSION.SDK_INT < 21) {
4 AnrHandler.getInstance().initialize(...);
5 }
6}
7复制代码
这里有个限制,是sdk <21的版本才抓取。
AnrHandler.java
1void initialize(Context ctx, int pid, String processName, String appId, String appVersion, String logDir,
2 boolean checkProcessState, int logcatSystemLines, int logcatEventsLines, int logcatMainLines,
3 boolean dumpFds, boolean dumpNetworkInfo, ICrashCallback callback) {
4 //check API level
5 if (Build.VERSION.SDK_INT >= 21) {
6 return;
7 }
8...
9 //FileObserver是用来监控文件系统,这里监听/data/anr/trace.txt
10 fileObserver = new FileObserver("/data/anr/", CLOSE_WRITE) {
11 public void onEvent(int event, String path) {
12 try {
13 if (path != null) {
14 String filepath = "/data/anr/" + path;
15 if (filepath.contains("trace")) {
16 //监听回调,处理anr
17 handleAnr(filepath);
18 }
19 }
20 } catch (Exception e) {
21 XCrash.getLogger().e(Util.TAG, "AnrHandler fileObserver onEvent failed", e);
22 }
23 }
24 };
25
26 try {
27 //启动监听
28 fileObserver.startWatching();
29 } catch (Exception e) {
30 fileObserver = null;
31 XCrash.getLogger().e(Util.TAG, "AnrHandler fileObserver startWatching failed", e);
32 }
33}
34复制代码
高版本系统已经没有读取/data/anr/的权限了,因此FileObserver监听/data/anr/的方案只能支持<21的版本;对>=21的版本,这种方案无法获取anr日志,xcrash改为在native层抓取(见下文)。
然后看看handleAnr收集了哪些数据:
1private void handleAnr(String filepath) {
2 Date anrTime = new Date();
3 //check ANR time interval
4 if (anrTime.getTime() - lastTime < anrTimeoutMs) {
5 return;
6 }
7
8 //check process error state
9 if (this.checkProcessState) {
10 if (!Util.checkProcessAnrState(this.ctx, anrTimeoutMs)) {
11 return;
12 }
13 }
14 //create log file
15 logFile = FileManager.getInstance().createLogFile(logPath);
16
17 //write info to log file
18 //write emergency info
19 raf.write(emergency.getBytes("UTF-8"));
20
21 //write logcat
22 raf.write(Util.getLogcat(logcatMainLines, logcatSystemLines, logcatEventsLines).getBytes("UTF-8"));
23
24 //write fds
25 raf.write(Util.getFds().getBytes("UTF-8"));
26
27 //write network info
28 raf.write(Util.getNetworkInfo().getBytes("UTF-8"));
29
30 //write memory info
31 raf.write(Util.getMemoryInfo().getBytes("UTF-8"));
32
33 //callback
34 if (callback != null) {
35 try {
36 callback.onCrash(logFile == null ? null : logFile.getAbsolutePath(), emergency);
37 } catch (Exception ignored) {
38 }
39 }
40}
41复制代码
这里重点关注checkProcessAnrState,它内部调用的getProcessesInErrorState是AMS对外暴露的api,从AMS的mLruProcesses中过滤出crash和anr异常的进程,返回对应的错误信息。补充cause reason部分,也就是ANR in。
1 static boolean checkProcessAnrState(Context ctx, long timeoutMs) {
2 ActivityManager am = (ActivityManager) ctx.getSystemService(Context.ACTIVITY_SERVICE);
3 if (am == null) return false;
4
5 int pid = android.os.Process.myPid();
6 long poll = timeoutMs / 500;
7 for (int i = 0; i < poll; i++) {
8 List<ActivityManager.ProcessErrorStateInfo> processErrorList = am.getProcessesInErrorState();
9 if (processErrorList != null) {
10 for (ActivityManager.ProcessErrorStateInfo errorStateInfo : processErrorList) {
11 if (errorStateInfo.pid == pid && errorStateInfo.condition == ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {
12 return true;
13 }
14 }
15 }
16
17 try {
18 Thread.sleep(500);
19 } catch (Exception ignored) {
20 }
21 }
22
23 return false;
24 }
25复制代码
那么>=21版本的anr如何抓取?
//init native crash handler / ANR handler (API level >= 21)
int r = Errno.OK;
if (params.enableNativeCrashHandler || (params.enableAnrHandler && Build.VERSION.SDK_INT >= 21)) {
 r = NativeHandler.getInstance().initialize(…);
}
是通过NativeHandler来抓的,也就是前面提到的:
1//trace init 捕获anr日志
2r_trace = xc_trace_init(...);
3复制代码
它是在native层注册 SIGQUIT 信号的处理函数,ANR发生时接收回调去收集ANR信息。
1int xc_trace_init(...)
2{
3 int r;
4 pthread_t thd;
5
6 //capture SIGQUIT only for ART
7 if(xc_common_api_level < 21) return 0;
8...
9 //init for JNI callback
10 xc_trace_init_callback(env);
11
12 //create event FD
13 if(0 > (xc_trace_notifier = eventfd(0, EFD_CLOEXEC))) return XCC_ERRNO_SYS;
14
15 //register signal handler
16 if(0 != (r = xcc_signal_trace_register(xc_trace_handler))) goto err2;
17
18 //create thread for dump trace
19 if(0 != (r = pthread_create(&thd, NULL, xc_trace_dumper, NULL))) goto err1;
20 ...
21 return r;
22}
23复制代码
这里xc_trace_notifier是一个eventfd ,在handler接收信号回调时被写
1static void xc_trace_handler(int sig, siginfo_t *si, void *uc)
2{
3 uint64_t data;
4
5 (void)sig;
6 (void)si;
7 (void)uc;
8
9 if(xc_trace_notifier >= 0)
10 {
11 data = 1;
12 XCC_UTIL_TEMP_FAILURE_RETRY(write(xc_trace_notifier, &data, sizeof(data)));
13 }
14}
15复制代码
然后xc_trace_dumper线程会解除阻塞状态开始执行dump任务。
1static void *xc_trace_dumper(void *arg)
2{
3 JNIEnv *env = NULL;
4 uint64_t data;
5 uint64_t trace_time;
6 int fd;
7 struct timeval tv;
8 char pathname[1024];
9 jstring j_pathname;
10
11 (void)arg;
12
13 pthread_detach(pthread_self());
14
15 JavaVMAttachArgs attach_args = {
16 .version = XC_JNI_VERSION,
17 .name = "xcrash_trace_dp",
18 .group = NULL
19 };
20 if(JNI_OK != (*xc_common_vm)->AttachCurrentThread(xc_common_vm, &env, &attach_args)) goto exit;
21
22 while(1)
23 {
24 //block here, waiting for sigquit
25 XCC_UTIL_TEMP_FAILURE_RETRY(read(xc_trace_notifier, &data, sizeof(data)));
26
27 //check if process already crashed
28 if(xc_common_native_crashed || xc_common_java_crashed) break;
29
30 //trace time
31 if(0 != gettimeofday(&tv, NULL)) break;
32 trace_time = (uint64_t)(tv.tv_sec) * 1000 * 1000 + (uint64_t)tv.tv_usec;
33
34 //Keep only one current trace.
35 if(0 != xc_trace_logs_clean()) continue;
36
37 //create and open log file
38 if((fd = xc_common_open_trace_log(pathname, sizeof(pathname), trace_time)) < 0) continue;
39
40 //write header info
41 if(0 != xc_trace_write_header(fd, trace_time)) goto end;
42
43 //write trace info from ART runtime
44 if(0 != xcc_util_write_format(fd, XCC_UTIL_THREAD_SEP"Cmd line: %s\n", xc_common_process_name)) goto end;
45 if(0 != xcc_util_write_str(fd, "Mode: ART DumpForSigQuit\n")) goto end;
46 if(0 != xc_trace_load_symbols())
47 {
48 if(0 != xcc_util_write_str(fd, "Failed to load symbols.\n")) goto end;
49 goto skip;
50 }
51 if(0 != xc_trace_check_address_valid())
52 {
53 if(0 != xcc_util_write_str(fd, "Failed to check runtime address.\n")) goto end;
54 goto skip;
55 }
56 if(dup2(fd, STDERR_FILENO) < 0)
57 {
58 if(0 != xcc_util_write_str(fd, "Failed to duplicate FD.\n")) goto end;
59 goto skip;
60 }
61
62 xc_trace_dump_status = XC_TRACE_DUMP_ON_GOING;
63 if(sigsetjmp(jmpenv, 1) == 0)
64 {
65 if(xc_trace_is_lollipop)
66 xc_trace_libart_dbg_suspend();
67 xc_trace_libart_runtime_dump(*xc_trace_libart_runtime_instance, xc_trace_libcpp_cerr);
68 if(xc_trace_is_lollipop)
69 xc_trace_libart_dbg_resume();
70 }
71 else
72 {
73 fflush(NULL);
74 XCD_LOG_WARN("longjmp to skip dumping trace\n");
75 }
76
77 dup2(xc_common_fd_null, STDERR_FILENO);
78
79 skip:
80 if(0 != xcc_util_write_str(fd, "\n"XCC_UTIL_THREAD_END"\n")) goto end;
81
82 //write other info
83 if(0 != xcc_util_record_logcat(fd, xc_common_process_id, xc_common_api_level, xc_trace_logcat_system_lines, xc_trace_logcat_events_lines, xc_trace_logcat_main_lines)) goto end;
84 if(xc_trace_dump_fds)
85 if(0 != xcc_util_record_fds(fd, xc_common_process_id)) goto end;
86 if(xc_trace_dump_network_info)
87 if(0 != xcc_util_record_network_info(fd, xc_common_process_id, xc_common_api_level)) goto end;
88 if(0 != xcc_meminfo_record(fd, xc_common_process_id)) goto end;
89
90 end:
91 //close log file
92 xc_common_close_trace_log(fd);
93
94 //rethrow SIGQUIT to ART Signal Catcher
95 if(xc_trace_rethrow && (XC_TRACE_DUMP_ART_CRASH != xc_trace_dump_status)) xc_trace_send_sigquit();
96 xc_trace_dump_status = XC_TRACE_DUMP_END;
97
98 //JNI callback
99 //Do we need to implement an emergency buffer for disk exhausted?
100 if(NULL == xc_trace_cb_method) continue;
101 if(NULL == (j_pathname = (*env)->NewStringUTF(env, pathname))) continue;
102 (*env)->CallStaticVoidMethod(env, xc_common_cb_class, xc_trace_cb_method, j_pathname, NULL);
103 XC_JNI_IGNORE_PENDING_EXCEPTION();
104 (*env)->DeleteLocalRef(env, j_pathname);
105 }
106
107 (*xc_common_vm)->DetachCurrentThread(xc_common_vm);
108
109 exit:
110 xc_trace_notifier = -1;
111 close(xc_trace_notifier);
112 return NULL;
113}
作者:Stan_Z
链接:https://juejin.cn/post/6991356414069309477
【精彩阅读】
Android逆向之Magisk+Edxposed刷入教程(内附资源)
Gradle Plugin+Transform+ASM Hook并替换隐私方法调用(彻底解决隐私不合规问题)