目录
SystemStatus & ComponentStatus
获取系统的内存占用函数为GetSystemMemoryUsage
结构分析
代码
class ResourceMonitor : public RecurrentRunner {
public:
ResourceMonitor();
void RunOnce(const double current_time) override;
private:
static void UpdateStatus(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status);
static void CheckDiskSpace(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status);
static void CheckCPUUsage(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status);
static void CheckMemoryUsage(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status);
static void CheckDiskLoads(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status);
};
分析
- 继承了RecurrentRunner,重写了RunOnce,说明主要的工作任务就是在RunOnce里面
- 有4个静态函数分别检查磁盘空间、CPU使用、内存使用、磁盘负载。
- 有一个更新状态的函数UpdateStatus。
执行主体分析
代码
void ResourceMonitor::RunOnce(const double current_time) {
auto manager = MonitorManager::Instance();
const auto& mode = manager->GetHMIMode();
auto* components = manager->GetStatus()->mutable_components();
for (const auto& iter : mode.monitored_components()) {
const std::string& name = iter.first;
const auto& config = iter.second;
if (config.has_resource()) {
UpdateStatus(config.resource(),
components->at(name).mutable_resource_status());
}
}
}
分析
- 创建一个monitor manage 的实例;
- 根据这个实例获取HMI的配置,得到HMI配置的要监控的component
- 遍历要监控的component,并使用updatestatus 更新状态
这里要认识一些proto ,不然后面的类分析会看不懂。
SystemStatus & ComponentStatus
message ComponentStatus {
enum Status {
UNKNOWN = 0;
OK = 1;
WARN = 2;
ERROR = 3;
FATAL = 4;
}
optional Status status = 1 [default = UNKNOWN];
optional string message = 2;
}
message Component {
// A summary of all detailed status.
optional ComponentStatus summary = 1;
// Detailed status.
optional ComponentStatus process_status = 2;
optional ComponentStatus channel_status = 3;
optional ComponentStatus resource_status = 4;
optional ComponentStatus other_status = 5;
optional ComponentStatus module_status = 6;
}
message SystemStatus {
optional apollo.common.Header header = 1;
map<string, ComponentStatus> hmi_modules = 7;
map<string, Component> components = 8;
// Some critical message for passengers. HMI should highlight it or even read
// loudly.
optional string passenger_msg = 4;
// If we have this field, safety_mode should be triggered.
// We'll check the system action and driver action continuously. If no proper
// action was taken in a specified period of time (such as 10 seconds), EStop
// will be sent to bring the vehicle into emergency full stop.
optional double safety_mode_trigger_time = 5;
optional bool require_emergency_stop = 6;
// In simulation mode, the monitor will publish message with this field set,
// so subscribers could identify it from the recorded messages.
optional bool is_realtime_in_simulation = 9;
// In some modes, other processes besides modules and monitored components may
// need to be monitored
map<string, ComponentStatus> other_components = 10;
reserved 2, 3;
}
检查磁盘空间分析
代码
void ResourceMonitor::CheckDiskSpace(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status) {
// Monitor available disk space.
for (const auto& disk_space : config.disk_spaces()) {
for (const auto& path : cyber::common::Glob(disk_space.path())) {
const auto space = boost::filesystem::space(path);
const int available_gb = static_cast<int>(space.available >> 30);
if (available_gb < disk_space.insufficient_space_error()) {
const std::string err =
absl::StrCat(path, " has insufficient space: ", available_gb,
"GB < ", disk_space.insufficient_space_error());
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
} else if (available_gb < disk_space.insufficient_space_warning()) {
const std::string err =
absl::StrCat(path, " has insufficient space: ", available_gb,
"GB < ", disk_space.insufficient_space_warning());
SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
}
}
}
}
分析
首先需要搞明白参数是什么,config参数 是从manager->GetHMIMode() 中的mode 得到的。
mode 是从下面这个函数得到的。
HMIConfig HMIWorker::LoadConfig() {
HMIConfig config;
// Get available modes, maps and vehicles by listing data directory.
*config.mutable_modes() =
ListFilesAsDict(FLAGS_hmi_modes_config_path, ".pb.txt");
ACHECK(!config.modes().empty())
<< "No modes config loaded from " << FLAGS_hmi_modes_config_path;
*config.mutable_maps() = ListDirAsDict(FLAGS_maps_data_path);
*config.mutable_vehicles() = ListDirAsDict(FLAGS_vehicles_config_path);
AINFO << "Loaded HMI config: " << config.DebugString();
return config;
}
FLAGS_hmi_modes_config_path 可以search 到,定义在hmi_work.cc中
DEFINE_string(hmi_modes_config_path, "/apollo/modules/dreamview/conf/hmi_modes", "HMI modes config path.");
可以知道/apollo/modules/dreamview/conf/hmi_modes下有一些后缀是.pb.txt 定义了config。
打开对应目录,下面有下述文件
├── camera-Lidar_sensor_calibration.pb.txt
├── car_teleop.pb.txt
├── console_teleop.pb.txt
├── dev_kit_close_loop.pb.txt
├── dev_kit_debug.pb.txt
├── ipc1_mkz_close_loop.pb.txt
├── ipc1_mkz_standard_debug.pb.txt
├── ipc2_mkz_close_loop.pb.txt
├── ipc2_mkz_standard_debug.pb.txt
├── lidar-IMU_sensor_calibration.pb.txt
├── map_collection.pb.txt
├── mkz_64beam.pb.txt
├── mkz_close_loop.pb.txt
├── mkz_lgsvl.pb.txt
├── mkz_standard_debug.pb.txt
├── mkz_standard_debug_hesai.pb.txt
├── mkz_standard_debug_smart_recorder.pb.txt
├── navigation.pb.txt
├── rtk.pb.txt
└── vehicle_calibration.pb.txt
然后把下面文件中的内容,读入到HMIMode 这个结构中
message HMIMode {
map<string, CyberModule> cyber_modules = 1;
map<string, Module> modules = 2;
map<string, MonitoredComponent> monitored_components = 3;
map<string, ProcessMonitorConfig> other_components = 4;
}
这些文件中都定义了对于监控目标的预值,比如:diskspace
monitored_components {
key: "Recorder"
value: {
process {
command_keywords: "cyber_recorder record"
}
resource {
disk_spaces {
# For logs.
path: "/apollo/data"
insufficient_space_warning: 8
insufficient_space_error: 2
}
disk_spaces {
# For records.
path: "/media/apollo/internal_nvme"
insufficient_space_warning: 128
insufficient_space_error: 32
}
}
}
}
会定义出文件的存储位置,以及对应的报错预值。
再来看我们的函数主体,就知道
for (const auto& disk_space : config.disk_spaces()) {
for (const auto& path : cyber::common::Glob(disk_space.path())) {
const auto space = boost::filesystem::space(path);
这段代码就是遍历每一个monitor config 中配置的磁盘位置,以及空间的大小。
if (available_gb < disk_space.insufficient_space_error()) {
const std::string err =
absl::StrCat(path, " has insufficient space: ", available_gb,
"GB < ", disk_space.insufficient_space_error());
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
} else if (available_gb < disk_space.insufficient_space_warning()) {
const std::string err =
absl::StrCat(path, " has insufficient space: ", available_gb,
"GB < ", disk_space.insufficient_space_warning());
SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
}
这一段就是通过设置的预值来报警。
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
这一段就是用当前最高级的故障来表示磁盘的状态。
SummaryMonitor 属于software monitor 的部分,在software monitor中介绍。
检查CPU 占用
代码
void ResourceMonitor::CheckCPUUsage(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status) {
for (const auto& cpu_usage : config.cpu_usages()) {
const auto process_dag_path = cpu_usage.process_dag_path();
float cpu_usage_value = 0.f;
if (process_dag_path.empty()) {
cpu_usage_value = GetSystemCPUUsage();
} else {
int pid = 0;
if (GetPIDByCmdLine(process_dag_path, &pid)) {
static std::unordered_map<std::string, uint64_t> prev_jiffies_map;
if (prev_jiffies_map.find(process_dag_path) == prev_jiffies_map.end()) {
prev_jiffies_map[process_dag_path] = 0;
}
cpu_usage_value = GetCPUUsage(pid, process_dag_path, &prev_jiffies_map);
}
}
const auto high_cpu_warning = cpu_usage.high_cpu_usage_warning();
const auto high_cpu_error = cpu_usage.high_cpu_usage_error();
if (cpu_usage_value > high_cpu_error) {
const std::string err = absl::StrCat(
process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
high_cpu_error, "%");
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
} else if (cpu_usage_value > high_cpu_warning) {
const std::string warn = absl::StrCat(
process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
high_cpu_warning, "%");
SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
}
}
}
分析
核心逻辑是获取CPU占用的部分和上面将的磁盘空间分析一样。
读取配置,遍历配置,然后根据CPU的占用,给出对应的级别的报警
具体的获取方法是GetSystemCPUUsage,本质上读取/proc/stat 文件,然后通过解析文件获取到的。
float GetSystemCPUUsage() {
const std::string system_cpu_stat_file = "/proc/stat";
const int users = 1, system = 3, total = 7;
constexpr static int kSystemCpuInfo = 0;
static uint64_t prev_jiffies = 0, prev_work_jiffies = 0;
const auto stat_lines =
GetStatsLines(system_cpu_stat_file, kSystemCpuInfo + 1);
if (stat_lines.size() <= kSystemCpuInfo) {
AERROR << "failed to load contents from " << system_cpu_stat_file;
return 0.f;
}
const std::vector<std::string> jiffies_stats =
absl::StrSplit(stat_lines[kSystemCpuInfo], ' ', absl::SkipWhitespace());
if (jiffies_stats.size() <= total) {
AERROR << "failed to get system CPU info from " << system_cpu_stat_file;
return 0.f;
}
uint64_t jiffies = 0, work_jiffies = 0;
for (int cur_stat = users; cur_stat <= total; ++cur_stat) {
const auto cur_stat_value = std::stoll(jiffies_stats[cur_stat]);
jiffies += cur_stat_value;
if (cur_stat <= system) {
work_jiffies += cur_stat_value;
}
}
const uint64_t tmp_prev_jiffies = prev_jiffies;
const uint64_t tmp_prev_work_jiffies = prev_work_jiffies;
prev_jiffies = jiffies;
prev_work_jiffies = work_jiffies;
if (tmp_prev_jiffies == 0) {
return 0.f;
}
return 100.f * (static_cast<float>(work_jiffies - tmp_prev_work_jiffies) /
(jiffies - tmp_prev_jiffies));
}
检查内存占用
代码
void ResourceMonitor::CheckMemoryUsage(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status) {
for (const auto& memory_usage : config.memory_usages()) {
const auto process_dag_path = memory_usage.process_dag_path();
float memory_usage_value = 0.f;
if (process_dag_path.empty()) {
memory_usage_value = GetSystemMemoryUsage();
} else {
int pid = 0;
if (GetPIDByCmdLine(process_dag_path, &pid)) {
memory_usage_value = GetMemoryUsage(pid, process_dag_path);
}
}
const auto high_memory_warning = memory_usage.high_memory_usage_warning();
const auto high_memory_error = memory_usage.high_memory_usage_error();
if (memory_usage_value > static_cast<float>(high_memory_error)) {
const std::string err = absl::StrCat(
process_dag_path, " has high memory usage: ", memory_usage_value,
" > ", high_memory_error);
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
} else if (memory_usage_value > static_cast<float>(high_memory_warning)) {
const std::string warn = absl::StrCat(
process_dag_path, " has high memory usage: ", memory_usage_value,
" > ", high_memory_warning);
SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
}
}
}
分析
- 读取配置文件
- 遍历配置,判断process_dag_path是否为空,如果是,就读取整个系统的内存占用,如果不是就读取这个进程的内存占用
- 根据设定要的预值来判定是不是需要报警
获取系统的内存占用函数为GetSystemMemoryUsage
float GetSystemMemoryUsage() {
const std::string system_mem_stat_file = "/proc/meminfo";
const int mem_total = 0, mem_free = 1, buffers = 3, cached = 4,
swap_total = 14, swap_free = 15, slab = 21;
const auto stat_lines = GetStatsLines(system_mem_stat_file, slab + 1);
if (stat_lines.size() <= slab) {
AERROR << "failed to load contents from " << system_mem_stat_file;
return 0.f;
}
const auto total_memory =
GetSystemMemoryValueFromLine(stat_lines[mem_total]) +
GetSystemMemoryValueFromLine(stat_lines[swap_total]);
int64_t used_memory = total_memory;
for (int cur_line = mem_free; cur_line <= slab; ++cur_line) {
if (cur_line == mem_free || cur_line == buffers || cur_line == cached ||
cur_line == swap_free || cur_line == slab) {
used_memory -= GetSystemMemoryValueFromLine(stat_lines[cur_line]);
}
}
return 100.f * (static_cast<float>(used_memory) / total_memory);
}
获取进程的内存占用函数为GetMemoryUsage
float GetMemoryUsage(const int pid, const std::string& process_name) {
const std::string memory_stat_file = absl::StrCat("/proc/", pid, "/statm");
const uint32_t page_size_kb = (sysconf(_SC_PAGE_SIZE) >> 10);
const int resident_idx = 1, gb_2_kb = (1 << 20);
constexpr static int kMemoryInfo = 0;
const auto stat_lines = GetStatsLines(memory_stat_file, kMemoryInfo + 1);
if (stat_lines.size() <= kMemoryInfo) {
AERROR << "failed to load contents from " << memory_stat_file;
return 0.f;
}
const std::vector<std::string> stats =
absl::StrSplit(stat_lines[kMemoryInfo], ' ', absl::SkipWhitespace());
if (stats.size() <= resident_idx) {
AERROR << "failed to get memory info for process " << process_name;
return 0.f;
}
return static_cast<float>(std::stoll(stats[resident_idx]) * page_size_kb) /
gb_2_kb;
}
都是读linux 下/proc 下的文件得到的。
检查磁盘负载
代码
void ResourceMonitor::CheckDiskLoads(
const apollo::dreamview::ResourceMonitorConfig& config,
ComponentStatus* status) {
for (const auto& disk_load : config.disk_load_usages()) {
const auto disk_load_value = GetSystemDiskload(disk_load.device_name());
const auto high_disk_load_warning = disk_load.high_disk_load_warning();
const auto high_disk_load_error = disk_load.high_disk_load_error();
if (disk_load_value > static_cast<float>(high_disk_load_error)) {
const std::string err = absl::StrCat(
disk_load.device_name(), " has high disk load: ", disk_load_value,
" > ", high_disk_load_error);
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
} else if (disk_load_value > static_cast<float>(high_disk_load_warning)) {
const std::string warn = absl::StrCat(
disk_load.device_name(), " has high disk load: ", disk_load_value,
" > ", high_disk_load_warning);
SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
}
}
}
分析
- 分析配置文件,找到磁盘负载配置
- 遍历配置,获取磁盘的实际负载,并根据设置的阈值进行故障判断
获取磁盘负载:GetSystemDiskload
float GetSystemDiskload(const std::string& device_name) {
const std::string disks_stat_file = "/proc/diskstats";
const int device = 2, in_out_ms = 12;
const int seconds_to_ms = 1000;
constexpr static int kDiskInfo = 128;
static uint64_t prev_disk_stats = 0;
const auto stat_lines = GetStatsLines(disks_stat_file, kDiskInfo);
uint64_t disk_stats = 0;
for (const auto& line : stat_lines) {
const std::vector<std::string> stats =
absl::StrSplit(line, ' ', absl::SkipWhitespace());
if (stats[device] == device_name) {
disk_stats = std::stoll(stats[in_out_ms]);
break;
}
}
const uint64_t tmp_prev_disk_stats = prev_disk_stats;
prev_disk_stats = disk_stats;
if (tmp_prev_disk_stats == 0) {
return 0.f;
}
return 100.f *
(static_cast<float>(disk_stats - tmp_prev_disk_stats) /
static_cast<float>(FLAGS_resource_monitor_interval * seconds_to_ms));
}