一 ANR场景
无论是四大组件或者进程等只要发生 ANR,最终都会调用 AMS.appNotResponding() 方法,下面从这个方法说起。
以下场景都会触发调用 AMS.appNotResponding 方法:
- Service Timeout:比如前台服务在 20s 内未执行完成;
- BroadcastQueue Timeout:比如前台广播在 10s 内未执行完成;
- InputDispatching Timeout:输入事件分发超时 5s,包括按键和触摸事件。
二 appNotResponding处理流程
2.1 AMS.appNotResponding
/**
 * Reports an ANR for the process that is making this binder call.
 *
 * <p>The caller is looked up in {@code mPidsSelfLocked} by its binder calling pid and the
 * report is handed to {@code mAnrHelper}.
 *
 * @param reason caller-supplied text, appended to the {@code "App requested: "} annotation
 * @throws SecurityException if the calling pid has no entry in {@code mPidsSelfLocked}
 */
public void appNotResponding(final String reason) {
    final int pid = Binder.getCallingPid();
    synchronized (mPidsSelfLocked) {
        final ProcessRecord caller = mPidsSelfLocked.get(pid);
        if (caller != null) {
            final String annotation = "App requested: " + reason;
            mAnrHelper.appNotResponding(caller, null, caller.info, null, null, false,
                    annotation);
            return;
        }
        throw new SecurityException("Unknown process: " + pid);
    }
}
2.2 AnrHelper.appNotResponding
/**
 * Queues an ANR report and makes sure a consumer thread is running to process it.
 *
 * <p>The record is created and appended to {@code mAnrRecords} while holding that list's
 * monitor; the consumer is started afterwards, outside the lock.
 */
void appNotResponding(ProcessRecord anrProcess, String activityShortComponentName,
        ApplicationInfo aInfo, String parentShortComponentName,
        WindowProcessController parentProcess, boolean aboveSystem, String annotation) {
    synchronized (mAnrRecords) {
        final AnrRecord pending = new AnrRecord(anrProcess, activityShortComponentName,
                aInfo, parentShortComponentName, parentProcess, aboveSystem, annotation);
        mAnrRecords.add(pending);
    }
    startAnrConsumerIfNeeded();
}
2.3 AnrHelper.appNotResponding::startAnrConsumerIfNeeded
/**
 * Starts a new {@code AnrConsumerThread} only when {@code mRunning} could be switched from
 * {@code false} to {@code true} by this call; otherwise does nothing.
 */
private void startAnrConsumerIfNeeded() {
    final boolean claimed = mRunning.compareAndSet(false, true);
    if (!claimed) {
        return;
    }
    final Thread consumer = new AnrConsumerThread();
    consumer.start();
}
/**
 * Worker thread (named "AnrConsumer") that drains {@code mAnrRecords} one record at a time
 * and resets {@code mRunning} when the list is empty.
 */
private class AnrConsumerThread extends Thread {
    AnrConsumerThread() {
        super("AnrConsumer");
    }

    /** Removes and returns the record at index 0, or returns null if the list is empty. */
    private AnrRecord next() {
        synchronized (mAnrRecords) {
            if (mAnrRecords.isEmpty()) {
                return null;
            }
            return mAnrRecords.remove(0);
        }
    }

    @Override
    public void run() {
        for (AnrRecord record = next(); record != null; record = next()) {
            scheduleBinderHeavyHitterAutoSamplerIfNecessary();
            final long begin = SystemClock.uptimeMillis();
            // Time the record spent queued. When several ANRs pile up this grows, and a
            // late stack trace may no longer be meaningful, so an expired record only
            // dumps the ANR process itself.
            final long latency = begin - record.mTimestamp;
            final boolean expired = latency > EXPIRED_REPORT_TIME_MS;
            record.appNotResponding(expired);
            final long elapsed = SystemClock.uptimeMillis() - begin;
            final String suffix = expired ? "ms (expired, only dump ANR app)" : "ms";
            Slog.d(TAG, "Completed ANR of " + record.mApp.processName + " in "
                    + elapsed + "ms, latency " + latency + suffix);
        }

        mRunning.set(false);
        synchronized (mAnrRecords) {
            // A record may have been queued between the last next() and the flag reset;
            // unlikely, but restart the consumer so it is not missed.
            if (!mAnrRecords.isEmpty()) {
                startAnrConsumerIfNeeded();
            }
        }
    }
}
2.4 AnrRecord.appNotResponding
/**
 * Immutable snapshot of the arguments of one ANR report plus the uptime at which the record
 * was created.
 */
private static class AnrRecord {
    // The process the ANR is reported for, and its application info.
    final ProcessRecord mApp;
    final ApplicationInfo mAppInfo;

    // Short component names of the activity and its parent, as passed by the reporter.
    final String mActivityShortComponentName;
    final String mParentShortComponentName;

    final WindowProcessController mParentProcess;
    final boolean mAboveSystem;
    final String mAnnotation;

    // SystemClock.uptimeMillis() captured when the record is constructed.
    final long mTimestamp;

    AnrRecord(ProcessRecord anrProcess, String activityShortComponentName,
            ApplicationInfo aInfo, String parentShortComponentName,
            WindowProcessController parentProcess, boolean aboveSystem, String annotation) {
        this.mTimestamp = SystemClock.uptimeMillis();
        this.mApp = anrProcess;
        this.mAppInfo = aInfo;
        this.mActivityShortComponentName = activityShortComponentName;
        this.mParentShortComponentName = parentShortComponentName;
        this.mParentProcess = parentProcess;
        this.mAboveSystem = aboveSystem;
        this.mAnnotation = annotation;
    }

    /**
     * Forwards the stored arguments to the error state of {@code mApp}.
     *
     * @param onlyDumpSelf passed through unchanged as the last argument
     */
    void appNotResponding(boolean onlyDumpSelf) {
        mApp.mErrorState.appNotResponding(
                mActivityShortComponentName,
                mAppInfo,
                mParentShortComponentName,
                mParentProcess,
                mAboveSystem,
                mAnnotation,
                onlyDumpSelf);
    }
}
2.5 ProcessRecord.ProcessErrorStateRecord.appNotResponding
(注:下面代码的方法签名与 2.4 中 mErrorState.appNotResponding 的调用参数并不一致,应是取自另一版本的实现,这里仅用于说明整体处理流程。)
/**
 * Handles an ANR of {@code app}: logs it, dumps stack traces, records CPU usage, stores the
 * result in the dropbox and finally either kills the process or asks the UI to show the ANR
 * dialog.
 *
 * <p>Visible steps, in order:
 * <ol>
 *   <li>refresh CPU stats; return early if shutting down, or if the app is already marked
 *       not-responding or crashing;</li>
 *   <li>write an AM_ANR event to the EventLog;</li>
 *   <li>fill firstPids (ANR process, this process, persistent processes) and lastPids
 *       (remaining processes from mLruProcesses);</li>
 *   <li>dump stack traces, refresh CPU stats again and log the summary with Slog.e;</li>
 *   <li>add the error to the dropbox;</li>
 *   <li>kill the process with "bg anr" when the background condition holds, otherwise mark
 *       it not-responding, rename the traces file and post SHOW_NOT_RESPONDING_MSG.</li>
 * </ol>
 *
 * <p>NOTE(review): firstPids, lastPids, cpuInfo, anrTime and showBackground are not declared
 * in the visible code; presumably they are set up in the elided parts — confirm against the
 * full source.
 *
 * @param app the process that is not responding
 * @param activity the activity involved, may be null
 * @param parent the parent activity, may be null
 * @param aboveSystem stored as arg1 (1 or 0) of the dialog message
 * @param annotation reason text, may be null
 */
final void appNotResponding(ProcessRecord app, ActivityRecord activity,
ActivityRecord parent, boolean aboveSystem, final String annotation) {
......
updateCpuStatsNow(); // first refresh of the CPU statistics
synchronized (this) {
// PowerManager.reboot() can block for a long time, so ANRs during shutdown are ignored
if (mShuttingDown) {
return;
} else if (app.notResponding) {
return;
} else if (app.crashing) {
return;
}
// Record the ANR in the EventLog
EventLog.writeEvent(EventLogTags.AM_ANR, app.userId, app.pid,
app.processName, app.info.flags, annotation);
// Add the ANR process itself to firstPids
firstPids.add(app.pid);
int parentPid = app.pid;
// Add the system_server process (MY_PID) to firstPids
if (MY_PID != app.pid && MY_PID != parentPid) firstPids.add(MY_PID);
for (int i = mLruProcesses.size() - 1; i >= 0; i--) {
ProcessRecord r = mLruProcesses.get(i);
if (r != null && r.thread != null) {
int pid = r.pid;
if (pid > 0 && pid != app.pid &&
pid != parentPid && pid != MY_PID) {
if (r.persistent) {
firstPids.add(pid); // persistent processes go into firstPids
} else {
lastPids.put(pid, Boolean.TRUE); // every other process goes into lastPids
}
}
}
}
}
// Build the ANR summary that is later written to the main log
StringBuilder info = new StringBuilder();
info.setLength(0);
info.append("ANR in ").append(app.processName);
if (activity != null && activity.shortComponentName != null) {
info.append(" (").append(activity.shortComponentName).append(")");
}
info.append("\n");
info.append("PID: ").append(app.pid).append("\n");
if (annotation != null) {
info.append("Reason: ").append(annotation).append("\n");
}
if (parent != null && parent != activity) {
info.append("Parent: ").append(parent.shortComponentName).append("\n");
}
// Create the CPU tracker object
final ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);
// Dump the stack traces (see section 2.6)
File tracesFile = dumpStackTraces(true, firstPids, processCpuTracker,
lastPids, NATIVE_STACKS_OF_INTEREST);
updateCpuStatsNow(); // second refresh of the CPU statistics
// Capture the current CPU usage of each process
synchronized (mProcessCpuTracker) {
cpuInfo = mProcessCpuTracker.printCurrentState(anrTime);
}
// Append the current CPU load
info.append(processCpuTracker.printCurrentLoad());
info.append(cpuInfo);
// Append the CPU usage measured since the ANR time
info.append(processCpuTracker.printCurrentState(anrTime));
// Log the ANR reason together with CPU usage and load information
Slog.e(TAG, info.toString());
// Save the traces file and the CPU usage info to the dropbox (data/system/dropbox directory)
addErrorToDropBox("anr", app, app.processName, activity, parent,
annotation, cpuInfo, tracesFile, null);
synchronized (this) {
...
// Background ANR: kill the process directly without showing a dialog
if (!showBackground && !app.isInterestingToUserLocked() &&
app.pid != MY_PID) {
app.kill("bg anr", true);
return;
}
// Mark the app as not responding and look up the error-report receiver
makeAppNotRespondingLocked(app,
activity != null ? activity.shortComponentName : null,
annotation != null ? "ANR " + annotation : "ANR",
info.toString());
// Rename the traces file
String tracesPath =
SystemProperties.get("dalvik.vm.stack-trace-file", null);
if (tracesPath != null && tracesPath.length() != 0) {
// traceRenameFile is e.g. "/data/anr/traces.txt"
File traceRenameFile = new File(tracesPath);
String newTracesPath;
int lpos = tracesPath.lastIndexOf (".");
if (-1 != lpos)
// New traces file, e.g. /data/anr/traces_<process name>_<current date>.txt
newTracesPath = tracesPath.substring (0, lpos) +
"_" + app.processName + "_" +
mTraceDateFormat.format(new Date()) +
tracesPath.substring (lpos);
else
newTracesPath = tracesPath + "_" + app.processName;
traceRenameFile.renameTo(new File(newTracesPath));
}
// Show the ANR dialog
Message msg = Message.obtain();
HashMap<String, Object> map = new HashMap<String, Object>();
msg.what = SHOW_NOT_RESPONDING_MSG;
msg.obj = map;
msg.arg1 = aboveSystem ? 1 : 0;
map.put("app", app);
if (activity != null) {
map.put("activity", activity);
}
// Send the SHOW_NOT_RESPONDING_MSG message to the UI handler
mUiHandler.sendMessage(msg);
}
}
当发生 ANR 时,会按顺序依次执行:
- 输出 ANR Reason 信息到 EventLog。也就是说,最接近 ANR 触发时间点的日志,就是 EventLog 中输出的 am_anr 信息
- 收集并输出重要进程列表中的各个线程的 traces 信息,该方法较耗时
- 输出当前各个进程的 CPU 使用情况以及 CPU 负载情况
- 将 traces 文件和 CPU 使用情况信息保存到 dropbox,即 data/system/dropbox 目录
- 根据进程类型,来决定直接后台杀掉,还是弹框告知用户
ANR 输出重要进程的 traces 信息,这些进程包含:
- firstPids 队列:第一个是 ANR 进程,第二个是 system_server,剩余是所有 persistent 进程
- Native 队列:是指 /system/bin/ 目录的 mediaserver,sdcard 以及 surfaceflinger 进程
- lastPids 队列:是指 mLruProcesses 中的不属于 firstPids 的所有进程。
2.6 AMS.dumpStackTraces
// Call site of dumpStackTraces: when isSilentAnr is true, null is passed in place of both
// processCpuTracker and lastPids.
// NOTE(review): this seven-argument call does not match the five-parameter signature shown
// directly below; presumably the two snippets come from different platform versions — confirm.
File tracesFile = ActivityManagerService.dumpStackTraces(firstPids,
isSilentAnr ? null : processCpuTracker, isSilentAnr ? null : lastPids,
nativePids, tracesFileException, offsets, annotation);
/**
 * Prepares the traces file named by the {@code dalvik.vm.stack-trace-file} system property
 * and then dumps the requested stacks into it.
 *
 * @param clearTraces when true, an existing traces file is deleted before it is re-created
 * @param firstPids pids forwarded to the dump step
 * @param processCpuTracker CPU tracker forwarded to the dump step
 * @param lastPids pids forwarded to the dump step
 * @param nativeProcs native process names forwarded to the dump step
 * @return the traces file, or null if the property is unset/empty or the file could not be
 *         created
 */
public static File dumpStackTraces(boolean clearTraces,
        ArrayList<Integer> firstPids, ProcessCpuTracker processCpuTracker,
        SparseArray<Boolean> lastPids, String[] nativeProcs) {
    final String path = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    final boolean configured = path != null && !path.isEmpty();
    if (!configured) {
        return null;
    }

    final File output = new File(path);
    try {
        // Start from a fresh file when asked to, so old traces are not mixed in.
        if (clearTraces) {
            if (output.exists()) {
                output.delete();
            }
        }
        output.createNewFile();
        FileUtils.setPermissions(output.getPath(), 0666, -1, -1);
    } catch (IOException e) {
        return null;
    }

    // The actual dumping is done by the path-based overload (see section 2.7).
    dumpStackTraces(path, firstPids, processCpuTracker, lastPids, nativeProcs);
    return output;
}
当 clearTraces 为 true 时,这里会先删除旧文件再重新创建,从而保证 /data/anr/traces.txt 的内容是全新写入的,而非追加在旧内容之后。
2.7 AMS.dumpStackTraces
/**
 * Dumps stack traces into the file at {@code tracesPath}, in three phases:
 * <ol>
 *   <li>sends SIGNAL_QUIT to every pid in {@code firstPids}, waiting up to 200ms after each
 *       one for the traces file to be closed by its writer;</li>
 *   <li>dumps a native backtrace for every process found for {@code nativeProcs};</li>
 *   <li>samples CPU usage over roughly 500ms and sends SIGNAL_QUIT to at most five working
 *       processes that are also present in {@code lastPids}.</li>
 * </ol>
 *
 * <p>Every {@code wait()} is wrapped in a handler for {@link InterruptedException}, which is
 * a checked exception and would otherwise not compile. A null {@code lastPids} skips the
 * third phase's signalling instead of throwing.
 *
 * @param tracesPath path of the traces file being watched and written
 * @param firstPids pids dumped first; may be null
 * @param processCpuTracker tracker used for the CPU sample; may be null to skip phase three
 * @param lastPids candidate pids for phase three; may be null
 * @param nativeProcs native process command names; may be null
 */
private static void dumpStackTraces(String tracesPath,
        ArrayList<Integer> firstPids, ProcessCpuTracker processCpuTracker,
        SparseArray<Boolean> lastPids, String[] nativeProcs) {
    // Wakes up the thread waiting on the observer once a writer closes the traces file.
    FileObserver observer =
            new FileObserver(tracesPath, FileObserver.CLOSE_WRITE) {
                @Override
                public synchronized void onEvent(int event, String path) { notify(); }
            };
    try {
        observer.startWatching();

        // First, collect the stacks of the most important processes.
        if (firstPids != null) {
            try {
                int num = firstPids.size();
                for (int i = 0; i < num; i++) {
                    synchronized (observer) {
                        // Ask the target process to dump its traces.
                        Process.sendSignal(firstPids.get(i), Process.SIGNAL_QUIT);
                        // Wait for write-close, give up after 200ms.
                        observer.wait(200);
                    }
                }
            } catch (InterruptedException e) {
                Slog.wtf(TAG, e);
            }
        }

        // Next, collect the stacks of the native processes (see section 2.8).
        if (nativeProcs != null) {
            int[] pids = Process.getPidsForCommands(nativeProcs);
            if (pids != null) {
                for (int pid : pids) {
                    Debug.dumpNativeBacktraceToFile(pid, tracesPath);
                }
            }
        }

        // Lastly, measure CPU usage and dump the busiest remaining processes.
        if (processCpuTracker != null) {
            processCpuTracker.init();
            System.gc();
            processCpuTracker.update();
            try {
                synchronized (processCpuTracker) {
                    processCpuTracker.wait(500); // measure over about half a second
                }
            } catch (InterruptedException ignored) {
                // Best effort: a shortened sampling window still yields usable numbers.
            }
            processCpuTracker.update();

            if (lastPids != null) {
                // Take stack dumps of at most five working processes that are in lastPids.
                final int N = processCpuTracker.countWorkingStats();
                int numProcs = 0;
                for (int i = 0; i < N && numProcs < 5; i++) {
                    ProcessCpuTracker.Stats stats = processCpuTracker.getWorkingStats(i);
                    if (lastPids.indexOfKey(stats.pid) >= 0) {
                        numProcs++;
                        try {
                            synchronized (observer) {
                                Process.sendSignal(stats.pid, Process.SIGNAL_QUIT);
                                // Wait for write-close, give up after 200ms.
                                observer.wait(200);
                            }
                        } catch (InterruptedException e) {
                            Slog.wtf(TAG, e);
                        }
                    }
                }
            }
        }
    } finally {
        observer.stopWatching();
    }
}
该方法的主要功能,依次输出:
1.收集 firstPids 进程的 stacks
- 第一个是发生ANR进程
- 第二个是system_server
- mLruProcesses 中所有的 persistent 进程
2.收集 Native 进程的 stacks;(dumpNativeBacktraceToFile)
- 依次是 mediaserver,sdcard,surfaceflinger 进程
3.收集 lastPids 进程的 stacks
- 依次输出 CPU 使用率 top 5 的进程
- Tips: 对 firstPids 列表中的每个进程发送信号后,都会等待 traces 文件写入关闭,最多等待 200ms,可见 persistent 进程越多,则耗时越长。top 5 进程的 traces 输出过程同样是每个进程最多等待 200ms,另外 CPU 使用情况的采集(等待 500ms)也比较耗时。
2.8 dumpNativeBacktraceToFile
Debug.dumpNativeBacktraceToFile(pid, tracesPath) 经过 JNI 调用如下方法:
// JNI implementation behind Debug.dumpNativeBacktraceToFile(pid, fileName).
// Converts the Java file name to a String8, opens that file for writing (creating it if
// needed, refusing to follow a symlink), seeks to the end so output lands after existing
// content, and then writes the backtrace of `pid` through dump_backtrace_to_file().
// NOTE(review): the elided parts ("......") presumably hold argument validation and the
// check that open() succeeded — confirm against the full source.
static void android_os_Debug_dumpNativeBacktraceToFile(JNIEnv* env,
jobject clazz, jint pid, jstring fileName) {
......
const jchar* str = env->GetStringCritical(fileName, 0);
String8 fileName8;
if (str) {
fileName8 = String8(reinterpret_cast<const char16_t*>(str),
env->GetStringLength(fileName));
env->ReleaseStringCritical(fileName, str);
}
// Open the traces file (e.g. /data/anr/traces.txt)
int fd = open(fileName8.string(),
O_CREAT | O_WRONLY | O_NOFOLLOW, 0666); /* -rw-rw-rw- */
......
if (lseek(fd, 0, SEEK_END) < 0) {
fprintf(stderr, "lseek: %s\n", strerror(errno));
} else {
// See section 2.9
dump_backtrace_to_file(pid, fd);
}
close(fd);
}
2.9 dump_backtrace_to_file
// Convenience wrapper: same as dump_backtrace_to_file_timeout() with timeout_secs == 0.
int dump_backtrace_to_file(pid_t tid, int fd) {
  const int timeout_secs = 0;
  return dump_backtrace_to_file_timeout(tid, fd, timeout_secs);
}
// Requests a backtrace dump for `tid` and copies everything received on the request socket
// into `fd`.
// Returns -1 if the request could not be made or a write did not transfer the whole chunk;
// returns 0 otherwise (including when read() ends the loop with an error).
int dump_backtrace_to_file_timeout(pid_t tid, int fd, int timeout_secs) {
  // Send the DEBUGGER_ACTION_DUMP_BACKTRACE request; the reply arrives on this socket.
  const int sock_fd = make_dump_request(DEBUGGER_ACTION_DUMP_BACKTRACE, tid, timeout_secs);
  if (sock_fd < 0) {
    return -1;
  }

  int status = 0;
  char chunk[1024];
  for (;;) {
    // Blocks until data arrives; stops on end-of-stream or read error.
    const ssize_t got = TEMP_FAILURE_RETRY(read(sock_fd, chunk, sizeof(chunk)));
    if (got <= 0) {
      break;
    }
    // Forward the chunk to the output file; a short write counts as failure.
    const ssize_t put = TEMP_FAILURE_RETRY(write(fd, chunk, got));
    if (put != got) {
      status = -1;
      break;
    }
  }

  close(sock_fd);
  return status;
}
可见,这个过程主要是通过向 debuggerd 守护进程发送命令 DEBUGGER_ACTION_DUMP_BACKTRACE, debuggerd 收到该命令,在子进程中调用 dump_backtrace() 来输出 backtrace,更多内容见 Native 进程之 Trace原理。
三 总结
触发 ANR 时系统会输出关键信息:(这个较耗时,可能会有10s)
1.将 am_anr 信息,输出到 EventLog (ANR 开始起点看 EventLog)
2.获取重要进程 trace 信息,保存到 /data/anr/traces.txt (会先删除老的文件)
- Java 进程的 traces
- Native 进程的 traces
3.ANR reason 以及 CPU 使用情况信息,输出到 main log
4.最后将 CPU 使用情况和进程 trace 文件信息保存到 /data/system/dropbox
整个过程中进程 Trace 的输出是最为核心的环节,Java 和 Native 进程采用不同的策略,如下:
| 进程类型 | trace命令 | 文章 | 描述 |
|---|---|---|---|
| Java | kill -3 [pid] | 解读Java进程的Trace文件 | 不适用于Native进程 |
| Native | debuggerd -b [pid] | Native进程之Trace原理 | 也适用于Java进程 |
说明:kill -3 命令需要虚拟机的支持,所以无法输出 Native 进程 traces。而 debuggerd -b [pid] 也可用于 Java 进程,但信息量远没有 kill -3 多。总之,ANR 信息最为重要的是 dropbox 信息,比如 system_server_anr。
重要节点:
- 进程名:cat /proc/[pid]/cmdline
- 线程名:cat /proc/[tid]/comm
- Kernel栈:cat /proc/[tid]/stack
- Native栈: 解析 /proc/[pid]/maps