文章目录
前言
前面分析了KOOM Java leak、native leak的监控思路,这篇继续分析它线程监控的整体思路。
使用
来看下使用方法,依旧是很简洁的三行代码即可开启监控:
// Build the monitor config and register the leak listener (see initMonitor below).
initMonitor()
// Start tracking; the actual start is posted to the monitor's loop handler.
ThreadMonitor.startTrackAsync()
// Stop tracking once monitoring is no longer needed (implementation not shown in this article).
ThreadMonitor.stop()
initMonitor
先看看怎么初始化监控器的:
/**
 * Hooks a [ThreadLeakListener] up to [ThreadMonitor].
 *
 * The listener logs every reported leak and error and mirrors them into the
 * two text views. If the monitor has already been initialized only the
 * listener is swapped; otherwise a fresh config is built and registered with
 * [MonitorManager].
 */
private fun initMonitor() {
  val leakListener = object : ThreadLeakListener {
    override fun onReport(leaks: MutableList<ThreadLeakRecord>) {
      for (record in leaks) {
        MonitorLog.i(LOG_TAG, record.toString())
      }
      // View updates are posted through the view itself.
      mLeakText.post {
        val leakedNames = leaks.map { record -> record.name }
        mLeakText.text = "leak threads: $leakedNames"
      }
    }

    override fun onError(msg: String) {
      MonitorLog.e(LOG_TAG, msg)
      mErrorText.post { mErrorText.text = msg }
    }
  }

  // Already initialized: just replace the listener.
  if (ThreadMonitor.isInitialized) {
    ThreadMonitor.setListener(leakListener)
    return
  }

  // NOTE(review): the two durations passed to enableThreadLeakCheck are
  // presumably the polling interval and the leak delay in ms; confirm
  // against ThreadMonitorConfig.Builder.
  val threadMonitorConfig = ThreadMonitorConfig.Builder()
      .enableThreadLeakCheck(2 * 1000L, 5 * 1000L)
      .setListener(leakListener)
      .build()
  MonitorManager.addMonitorConfig(threadMonitorConfig)
}
依旧是构建者模式设置一些参数,然后通过接口回调的方式开启监听。
看下ThreadLeakRecord长什么样子:
/**
 * One leaked thread as delivered to the leak listener.
 *
 * NOTE(review): the unit and clock of the three timestamps are not visible
 * here (the native side records nanosecond timestamps; confirm what is
 * forwarded).
 *
 * @property tid id of the leaked thread
 * @property createTime timestamp recorded when the thread was created
 * @property startTime timestamp recorded when the thread started running
 * @property endTime timestamp recorded when the thread ended
 * @property name thread name
 * @property createCallStack call stack captured at creation, printed verbatim
 */
@Keep
data class ThreadLeakRecord(
    val tid: Int,
    val createTime: Long,
    val startTime: Long,
    val endTime: Long,
    val name: String,
    val createCallStack: String) {
  /** Multi-line, human-readable dump: one field per line, call stack last. */
  override fun toString(): String = StringBuilder().apply {
    append("tid: $tid\n")
    // createTime is a timestamp; the former " Byte" suffix was a bogus unit.
    append("createTime: $createTime\n")
    append("startTime: $startTime\n")
    append("endTime: $endTime\n")
    append("name: $name\n")
    append("createCallStack:\n")
    append(createCallStack)
  }.toString()
}
/**
 * Groups a batch of [ThreadLeakRecord]s under a type label.
 *
 * NOTE(review): the set of valid `type` values is not visible in this
 * snippet; confirm against the code that builds the container.
 */
@Keep
data class ThreadLeakContainer(
val type: String,
val threads: MutableList<ThreadLeakRecord>)
ThreadMonitor.startTrackAsync()
/**
 * Asynchronous variant of [startTrack]: the call is posted to the front of
 * the loop handler's queue rather than executed on the caller's thread.
 */
fun startTrackAsync() {
  val loopHandler = getLoopHandler()
  loopHandler.postAtFrontOfQueue { startTrack() }
}
这里handler执行的是postAtFrontOfQueue,这个方法会把当前任务post到消息队列的队列头里面,优先执行。
/**
 * Initializes the native side and, only if that succeeds, marks the monitor
 * as running and schedules the polling loop after the configured start delay.
 */
fun startTrack() {
  if (!handleNativeInit()) return

  mIsRunning = true
  startLoop(clearQueue = true, postAtFront = false, delayMillis = monitorConfig.startDelay)
}
/**
 * Validates the runtime environment, loads the native library and pushes the
 * configuration down to [NativeHandler].
 *
 * Every failure is reported through the configured listener's `onError`.
 *
 * @return `true` when native monitoring was started, `false` on an
 *   unsupported SDK level, a non-arm64 device, or a library load failure.
 */
private fun handleNativeInit(): Boolean {
  // Report the reason to the listener (if any) and signal failure.
  fun fail(reason: String): Boolean {
    monitorConfig.listener?.onError(reason)
    return false
  }

  // Accepted range is (O, R]. NOTE(review): the message mentions P, but the
  // check also lets O_MR1 through; confirm which one is intended.
  val sdk = Build.VERSION.SDK_INT
  val sdkSupported = sdk > Build.VERSION_CODES.O && sdk <= Build.VERSION_CODES.R
  if (!sdkSupported) return fail("not support P below or R above now!")

  if (!isArm64()) return fail("support arm64 only!")

  if (!loadSoQuietly("koom-thread")) return fail("loadLibrary fail")
  MonitorLog.i(TAG, "loadLibrary success")

  // Optional switches are applied before the native side is started.
  if (monitorConfig.disableNativeStack) NativeHandler.disableNativeStack()
  if (monitorConfig.disableJavaStack) NativeHandler.disableJavaStack()
  if (monitorConfig.enableNativeLog) NativeHandler.enableNativeLog()

  NativeHandler.setThreadLeakDelay(monitorConfig.threadLeakDelay)
  NativeHandler.start()
  MonitorLog.i(TAG, "init finish")
  return true
}
这里可以看出,线程监控当前只支持SDK版本27~30(判断条件是SDK_INT <= O时直接返回失败,所以API 26本身也不在支持范围内),同时只支持arm64架构。
后面就是设置一些参数
// Out-of-class definitions of CallStack's static atomic switches.
std::atomic<bool> CallStack::disableJava;
std::atomic<bool> CallStack::disableNative;

// Raises the disableNative switch. How the flag is consumed is not shown in
// this snippet.
void CallStack::DisableNative() { disableNative.store(true); }
位于cpp/src/common/callstack.cpp里面。
重点需要关注的是NativeHandler.start()方法:
// JNI entry point for NativeHandler.start() on the Kotlin side.
// Logs "start" under the "koom-thread" tag and then delegates to koom::Start().
// Neither env nor obj is used by this function.
JNIEXPORT void JNICALL
Java_com_kwai_performance_overhead_thread_monitor_NativeHandler_start(
JNIEnv *env, jclass obj) {
koom::Log::info("koom-thread", "start");
koom::Start();
}
先是打印了一条日志,然后调用koom::Start():
Start
// Starts native thread monitoring. Does nothing when monitoring is already
// running; otherwise replaces the hook looper with a fresh instance, installs
// the thread hooks and finally raises the running flag.
void Start() {
  if (!isRunning) {
    // Reset state: discard any previous looper before creating a new one.
    delete sHookLooper;
    sHookLooper = new HookLooper();
    koom::ThreadHooker::Start();
    isRunning = true;
  }
}
HookLooper
#include "hook_looper.h"
#include "koom.h"
#include "loop_item.h"
namespace koom {
const char *looper_tag = "koom-hook-looper";
HookLooper::HookLooper() : looper() {
this->holder = new koom::ThreadHolder(); }
HookLooper::~HookLooper() {
delete this->holder; }
void HookLooper::handle(int what, void *data) {
looper::handle(what, data);
switch (what) {
case ACTION_ADD_THREAD: {
koom::Log::info(looper_tag, "AddThread");
auto info = static_cast<HookAddInfo *>(data);
holder->AddThread(info->tid, info->pthread, info->is_thread_detached,
info->time, info->create_arg);
delete info;
break;
}
case ACTION_JOIN_THREAD: {
koom::Log::info(looper_tag, "JoinThread");
auto info = static_cast<HookInfo *>(data);
holder->JoinThread(info->thread_id);
delete info;
break;
}
case ACTION_DETACH_THREAD: {
koom::Log::info(looper_tag, "DetachThread");
auto info = static_cast<HookInfo *>(data);
holder->DetachThread(info->thread_id);
delete info;
break;
}
case ACTION_EXIT_THREAD: {
koom::Log::info(looper_tag, "ExitThread");
auto info = static_cast<HookExitInfo *>(data);
holder->ExitThread(info->thread_id, info->threadName, info->time);
delete info;
break;
}
case ACTION_REFRESH: {
koom::Log::info(looper_tag, "Refresh");
auto info = static_cast<SimpleHookInfo *>(data);
holder->ReportThreadLeak(info->time);
delete info;
break;
}
default: {
}
}
}
void HookLooper::post(int what, void *data) {
looper::post(what, data); }
} // namespace koom
这个类用来响应handler接收到的message,可以推断出,后面会有hook线程的工作,然后在线程的各个生命周期节点上进行一些信息的记录和日志的打印。
InitHook
// Installs the pthread hooks in three steps: collect the libraries DlopenCb
// already knows about, hook them, then register DlopenCallback with DlopenCb.
void ThreadHooker::InitHook() {
  koom::Log::info(thread_tag, "HookSo init hook");

  auto &dlopen_cb = DlopenCb::GetInstance();
  std::set<std::string> loaded_libs;
  dlopen_cb.GetLoadedLibs(loaded_libs);
  HookLibs(loaded_libs, Constant::kDlopenSourceInit);
  dlopen_cb.AddCallback(DlopenCallback);
}
关键代码有三行,下面逐一来分析:
GetLoadedLibs
// Adds every library name currently held in hooked_libs to `libs`.
//
// @param libs    output set; existing entries are kept, new names are merged in
// @param refresh when true, Refresh() is run first with the
//                dlopen_source_get_libs source and an empty library name
//
// hooked_libs is only touched while add_lib_mutex is held. The size used for
// logging is captured inside the critical section, and the log call itself
// happens after unlocking so the lock is held no longer than necessary.
void DlopenCb::GetLoadedLibs(std::set<std::string> &libs, bool refresh) {
  if (refresh) {
    std::string empty;
    Refresh(dlopen_source_get_libs, empty);
  }

  pthread_mutex_lock(&add_lib_mutex);
  const size_t hooked_count = hooked_libs.size();
  libs.insert(hooked_libs.begin(), hooked_libs.end());
  pthread_mutex_unlock(&add_lib_mutex);

  // size_t must be printed with %zu; %d is a format/argument mismatch.
  XH_LOG_INFO("GetLoadedLibs origin %zu", hooked_count);
}
这里把一些已经加载的动态库添加到set容器中。
// Entry point for installing the thread hooks; simply delegates to InitHook().
void ThreadHooker::Start() {
ThreadHooker::InitHook(); }
HookLibs
看下HookLibs:
// Registers the pthread hooks for every library in `libs` and applies them.
//
// @param libs   names of the libraries to hook; an empty set is a no-op
// @param source origin tag, passed through to RegisterSo
//
// The whole clear/register/refresh sequence runs under DlopenCb::hook_mutex.
// xhook_refresh is only invoked when at least one library was registered.
void ThreadHooker::HookLibs(std::set<std::string> &libs, int source) {
  // libs.size() is a size_t, so it needs %zu rather than %d.
  koom::Log::info(thread_tag, "HookSo lib size %zu", libs.size());
  if (libs.empty()) {
    return;
  }

  bool hooked = false;
  pthread_mutex_lock(&DlopenCb::hook_mutex);
  xhook_clear();
  for (const auto &lib : libs) {
    hooked |= ThreadHooker::RegisterSo(lib, source);
  }
  if (hooked) {
    int result = xhook_refresh(0);
    koom::Log::info(thread_tag, "HookSo lib Refresh result %d", result);
  }
  pthread_mutex_unlock(&DlopenCb::hook_mutex);
}
这里是遍历了set容器里面的所有库文件,然后hook这些lib。
这里面也进行了加锁操作,定义hooked bool变量,假如hook成功其中一个lib,就算hook成功。在后面进行xhook_refresh操作,然后打印日志。
RegisterSo
看起来是个很重要的函数,来分析下:
// Registers the four pthread hooks for a single library with xhook.
//
// @param lib    library name handed to xhook_register
// @param source origin tag, used only in the log line
// @return false when IsLibIgnored(lib) is true, true otherwise. The results
//         of the individual xhook_register calls are not inspected.
bool ThreadHooker::RegisterSo(const std::string &lib, int source) {
  if (IsLibIgnored(lib)) {
    return false;
  }

  const char *lib_name = lib.c_str();
  koom::Log::info(thread_tag, "HookSo %d %s", source, lib_name);

  // Symbol -> replacement table, registered in declaration order.
  const struct {
    const char *symbol;
    void *replacement;
  } hooks[] = {
      {"pthread_create", reinterpret_cast<void *>(HookThreadCreate)},
      {"pthread_detach", reinterpret_cast<void *>(HookThreadDetach)},
      {"pthread_join", reinterpret_cast<void *>(HookThreadJoin)},
      {"pthread_exit", reinterpret_cast<void *>(HookThreadExit)},
  };
  for (const auto &hook : hooks) {
    xhook_register(lib_name, hook.symbol, hook.replacement, nullptr);
  }
  return true;
}
果然,这里hook了这几个函数:
- pthread_create
- pthread_detach
- pthread_join
- pthread_exit
HookThreadCreate
int ThreadHooker::HookThreadCreate(pthread_t *tidp, const pthread_attr_t *attr,
void *(*start_rtn)(void *), void *arg) {
if (hookEnabled() && start_rtn != nullptr) {
auto time = Util::CurrentTimeNs();
koom::Log::info(thread_tag, "HookThreadCreate");
auto *hook_arg = new StartRtnArg(arg, Util::CurrentTimeNs(), start_rtn);
auto *thread_create_arg = hook_arg->thread_create_arg;
void *thread = koom::CallStack::GetCurrentThread();
if (thread != nullptr) {
koom::CallStack::JavaStackTrace(thread,
hook_arg->thread_create_arg->java_stack);
}
koom::CallStack::FastUnwind(thread_create_arg->pc,
koom::Constant::kMaxCallStackDepth);
thread_create_arg->stack_time = Util::CurrentTimeNs() - time;
return pthread_create(tidp, attr,
reinterpret_cast<void *(*)(void *)>(HookThreadStart),
reinterpret_cast<void *>(hook_arg));
}
return pthread_create(tidp, attr, start_rtn, arg);
}
HookThreadDetach
// Replacement for pthread_detach. When hooking is enabled, a detach event for
// `t` is posted to the hook looper before the real pthread_detach is called.
int ThreadHooker::HookThreadDetach(pthread_t t) {
  if (hookEnabled()) {
    // The caller's tid is used for logging only.
    const int c_tid = static_cast<int>(syscall(SYS_gettid));
    koom::Log::info(thread_tag, "HookThreadDetach c_tid:%0x", c_tid);
    auto *detach_info = new HookInfo(t, Util::CurrentTimeNs());
    sHookLooper->post(ACTION_DETACH_THREAD, detach_info);
  }
  return pthread_detach(t);
}
HookThreadJoin
// Replacement for pthread_join. When hooking is enabled, a join event for `t`
// is posted to the hook looper before the real pthread_join is called.
int ThreadHooker::HookThreadJoin(pthread_t t, void **return_value) {
  if (hookEnabled()) {
    // The caller's tid is used for logging only.
    const int c_tid = static_cast<int>(syscall(SYS_gettid));
    koom::Log::info(thread_tag, "HookThreadJoin c_tid:%0x", c_tid);
    auto *join_info = new HookInfo(t, Util::CurrentTimeNs());
    sHookLooper->post(ACTION_JOIN_THREAD, join_info);
  }
  return pthread_join(t, return_value);
}
HookThreadExit
// Replacement for pthread_exit. When hooking is enabled, an exit event
// carrying the calling thread's pthread handle, tid, name and a timestamp is
// posted to the hook looper before the real pthread_exit is called.
void ThreadHooker::HookThreadExit(void *return_value) {
  if (!hookEnabled()) {
    pthread_exit(return_value);
  }

  koom::Log::info(thread_tag, "HookThreadExit");
  const int tid = static_cast<int>(syscall(SYS_gettid));

  // Zero-initialised buffer that prctl(PR_GET_NAME) fills with the name.
  char thread_name[16] = {};
  prctl(PR_GET_NAME, thread_name);

  auto *exit_info =
      new HookExitInfo(pthread_self(), tid, thread_name, Util::CurrentTimeNs());
  sHookLooper->post(ACTION_EXIT_THREAD, exit_info);
  pthread_exit(return_value);
}
上面hook了系统进行线程操作的函数,然后通过sHookLooper->post的方式,把message post到一个handler里面进行处理。到这里也就印证了前面HookLooper那一节对handler处理逻辑的分析。
AddCallback
回退一下,hook libs之后执行的是DlopenCb::GetInstance().AddCallback(DlopenCallback)。
callback代码:
// Callback handed to DlopenCb::AddCallback by InitHook.
// Forwards the reported library set and source to HookLibs; source_lib is
// not used.
void ThreadHooker::DlopenCallback(std::set<std::string> &libs, int source,
std::string &source_lib) {
HookLibs(libs, source);
}
可以看到里面也是执行了HookLibs方法的代码。
总结
上述就是KOOM线程监控实现的逻辑走向,梳理下来可以了解它的整体思路。但其中还有相当多的细节值得我们深究和学习,后续会继续学习和分享。