Apollo 应用与源码分析:Monitor监控-硬件监控-CPU、磁盘、内存资源监控逻辑源码分析

目录

结构分析

代码

分析

执行主体分析

代码

分析

SystemStatus & ComponentStatus

检查磁盘空间分析

代码

分析

检查CPU 占用

代码

分析

检查内存占用

代码

分析

获取系统的内存占用函数为GetSystemMemoryUsage

获取进程的内存占用函数为GetMemoryUsage

检查磁盘负载

代码

分析

获取磁盘负载:GetSystemDiskload


结构分析

代码

class ResourceMonitor : public RecurrentRunner {
 public:
  ResourceMonitor();
  void RunOnce(const double current_time) override;

 private:
  static void UpdateStatus(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckDiskSpace(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckCPUUsage(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckMemoryUsage(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckDiskLoads(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
};

分析

  1. 继承了RecurrentRunner,重写了RunOnce,说明主要的工作任务就是在RunOnce里面
  2. 有4个静态函数分别检查磁盘空间、CPU使用、内存使用、磁盘负载。
  3. 有一个更新状态的函数UpdateStatus。

执行主体分析

代码

void ResourceMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  const auto& mode = manager->GetHMIMode();
  auto* components = manager->GetStatus()->mutable_components();

  for (const auto& iter : mode.monitored_components()) {
    const std::string& name = iter.first;
    const auto& config = iter.second;
    if (config.has_resource()) {
      UpdateStatus(config.resource(),
                   components->at(name).mutable_resource_status());
    }
  }
}

分析

  1. 创建一个monitor manage 的实例;
  2. 根据这个实例获取HMI的配置,得到HMI配置的要监控的component
  3. 遍历要监控的component,并使用updatestatus 更新状态

这里要认识一些proto ,不然后面的类分析会看不懂。

SystemStatus & ComponentStatus

message ComponentStatus {
  enum Status {
    UNKNOWN = 0;
    OK = 1;
    WARN = 2;
    ERROR = 3;
    FATAL = 4;
  }
  optional Status status = 1 [default = UNKNOWN];
  optional string message = 2;
}

message Component {
  // A summary of all detailed status.
  optional ComponentStatus summary = 1;

  // Detailed status.
  optional ComponentStatus process_status = 2;
  optional ComponentStatus channel_status = 3;
  optional ComponentStatus resource_status = 4;
  optional ComponentStatus other_status = 5;
  optional ComponentStatus module_status = 6;
}

message SystemStatus {
  optional apollo.common.Header header = 1;

  map<string, ComponentStatus> hmi_modules = 7;
  map<string, Component> components = 8;

  // Some critical message for passengers. HMI should highlight it or even read
  // loudly.
  optional string passenger_msg = 4;

  // If we have this field, safety_mode should be triggered.
  // We'll check the system action and driver action continuously. If no proper
  // action was taken in a specified period of time (such as 10 seconds), EStop
  // will be sent to bring the vehicle into emergency full stop.
  optional double safety_mode_trigger_time = 5;
  optional bool require_emergency_stop = 6;

  // In simulation mode, the monitor will publish message with this field set,
  // so subscribers could identify it from the recorded messages.
  optional bool is_realtime_in_simulation = 9;

  // In some modes, other processes besides modules and monitored components may
  // need to be monitored
  map<string, ComponentStatus> other_components = 10;

  reserved 2, 3;
}

检查磁盘空间分析

代码

void ResourceMonitor::CheckDiskSpace(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  // Monitor available disk space.
  for (const auto& disk_space : config.disk_spaces()) {
    for (const auto& path : cyber::common::Glob(disk_space.path())) {
      const auto space = boost::filesystem::space(path);
      const int available_gb = static_cast<int>(space.available >> 30);
      if (available_gb < disk_space.insufficient_space_error()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_error());
        SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
      } else if (available_gb < disk_space.insufficient_space_warning()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_warning());
        SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
      }
    }
  }
}

分析

首先需要搞明白参数是什么,config参数 是从manager->GetHMIMode() 中的mode 得到的。

mode 是从下面这个函数得到的。

HMIConfig HMIWorker::LoadConfig() {
  HMIConfig config;
  // Get available modes, maps and vehicles by listing data directory.
  *config.mutable_modes() =
      ListFilesAsDict(FLAGS_hmi_modes_config_path, ".pb.txt");
  ACHECK(!config.modes().empty())
      << "No modes config loaded from " << FLAGS_hmi_modes_config_path;

  *config.mutable_maps() = ListDirAsDict(FLAGS_maps_data_path);
  *config.mutable_vehicles() = ListDirAsDict(FLAGS_vehicles_config_path);
  AINFO << "Loaded HMI config: " << config.DebugString();
  return config;
}

FLAGS_hmi_modes_config_path 可以search 到,定义在hmi_work.cc中

DEFINE_string(hmi_modes_config_path, "/apollo/modules/dreamview/conf/hmi_modes",
              "HMI modes config path.");

可以知道/apollo/modules/dreamview/conf/hmi_modes下有一些后缀是.pb.txt 定义了config。

打开对应目录,下面有下述文件

├── camera-Lidar_sensor_calibration.pb.txt
├── car_teleop.pb.txt
├── console_teleop.pb.txt
├── dev_kit_close_loop.pb.txt
├── dev_kit_debug.pb.txt
├── ipc1_mkz_close_loop.pb.txt
├── ipc1_mkz_standard_debug.pb.txt
├── ipc2_mkz_close_loop.pb.txt
├── ipc2_mkz_standard_debug.pb.txt
├── lidar-IMU_sensor_calibration.pb.txt
├── map_collection.pb.txt
├── mkz_64beam.pb.txt
├── mkz_close_loop.pb.txt
├── mkz_lgsvl.pb.txt
├── mkz_standard_debug.pb.txt
├── mkz_standard_debug_hesai.pb.txt
├── mkz_standard_debug_smart_recorder.pb.txt
├── navigation.pb.txt
├── rtk.pb.txt
└── vehicle_calibration.pb.txt

然后把下面文件中的内容,读入到HMIMode 这个结构中

message HMIMode {
  map<string, CyberModule> cyber_modules = 1;
  map<string, Module> modules = 2;
  map<string, MonitoredComponent> monitored_components = 3;
  map<string, ProcessMonitorConfig> other_components = 4;
}

这些文件中都定义了对于监控目标的预值,比如:diskspace

monitored_components {
  key: "Recorder"
  value: {
    process {
      command_keywords: "cyber_recorder record"
    }
    resource {
      disk_spaces {
        # For logs.
        path: "/apollo/data"
        insufficient_space_warning: 8
        insufficient_space_error: 2
      }
      disk_spaces {
        # For records.
        path: "/media/apollo/internal_nvme"
        insufficient_space_warning: 128
        insufficient_space_error: 32
      }
    }
  }
}

会定义出文件的存储位置,以及对应的报错预值。

再来看我们的函数主体,就知道

for (const auto& disk_space : config.disk_spaces()) {
    for (const auto& path : cyber::common::Glob(disk_space.path())) {
      const auto space = boost::filesystem::space(path);

这段代码就是遍历每一个monitor config 中配置的磁盘位置,以及空间的大小。


 if (available_gb < disk_space.insufficient_space_error()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_error());
        SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
      } else if (available_gb < disk_space.insufficient_space_warning()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_warning());
        SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
      }

这一段就是通过设置的预值来报警。

SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);

这一段就是用当前最高级的故障来表示磁盘的状态。

SummaryMonitor 属于software monitor 的部分,在software monitor中介绍。

检查CPU 占用

代码

void ResourceMonitor::CheckCPUUsage(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& cpu_usage : config.cpu_usages()) {
    const auto process_dag_path = cpu_usage.process_dag_path();
    float cpu_usage_value = 0.f;
    if (process_dag_path.empty()) {
      cpu_usage_value = GetSystemCPUUsage();
    } else {
      int pid = 0;
      if (GetPIDByCmdLine(process_dag_path, &pid)) {
        static std::unordered_map<std::string, uint64_t> prev_jiffies_map;
        if (prev_jiffies_map.find(process_dag_path) == prev_jiffies_map.end()) {
          prev_jiffies_map[process_dag_path] = 0;
        }
        cpu_usage_value = GetCPUUsage(pid, process_dag_path, &prev_jiffies_map);
      }
    }
    const auto high_cpu_warning = cpu_usage.high_cpu_usage_warning();
    const auto high_cpu_error = cpu_usage.high_cpu_usage_error();
    if (cpu_usage_value > high_cpu_error) {
      const std::string err = absl::StrCat(
          process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
          high_cpu_error, "%");
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (cpu_usage_value > high_cpu_warning) {
      const std::string warn = absl::StrCat(
          process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
          high_cpu_warning, "%");
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

分析

核心逻辑是获取CPU占用的部分和上面将的磁盘空间分析一样。

读取配置,遍历配置,然后根据CPU的占用,给出对应的级别的报警

具体的获取方法是GetSystemCPUUsage,本质上读取/proc/stat 文件,然后通过解析文件获取到的。

float GetSystemCPUUsage() {
  const std::string system_cpu_stat_file = "/proc/stat";
  const int users = 1, system = 3, total = 7;
  constexpr static int kSystemCpuInfo = 0;
  static uint64_t prev_jiffies = 0, prev_work_jiffies = 0;
  const auto stat_lines =
      GetStatsLines(system_cpu_stat_file, kSystemCpuInfo + 1);
  if (stat_lines.size() <= kSystemCpuInfo) {
    AERROR << "failed to load contents from " << system_cpu_stat_file;
    return 0.f;
  }
  const std::vector<std::string> jiffies_stats =
      absl::StrSplit(stat_lines[kSystemCpuInfo], ' ', absl::SkipWhitespace());
  if (jiffies_stats.size() <= total) {
    AERROR << "failed to get system CPU info from " << system_cpu_stat_file;
    return 0.f;
  }
  uint64_t jiffies = 0, work_jiffies = 0;
  for (int cur_stat = users; cur_stat <= total; ++cur_stat) {
    const auto cur_stat_value = std::stoll(jiffies_stats[cur_stat]);
    jiffies += cur_stat_value;
    if (cur_stat <= system) {
      work_jiffies += cur_stat_value;
    }
  }
  const uint64_t tmp_prev_jiffies = prev_jiffies;
  const uint64_t tmp_prev_work_jiffies = prev_work_jiffies;
  prev_jiffies = jiffies;
  prev_work_jiffies = work_jiffies;
  if (tmp_prev_jiffies == 0) {
    return 0.f;
  }
  return 100.f * (static_cast<float>(work_jiffies - tmp_prev_work_jiffies) /
                  (jiffies - tmp_prev_jiffies));
}

检查内存占用

代码

void ResourceMonitor::CheckMemoryUsage(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& memory_usage : config.memory_usages()) {
    const auto process_dag_path = memory_usage.process_dag_path();
    float memory_usage_value = 0.f;
    if (process_dag_path.empty()) {
      memory_usage_value = GetSystemMemoryUsage();
    } else {
      int pid = 0;
      if (GetPIDByCmdLine(process_dag_path, &pid)) {
        memory_usage_value = GetMemoryUsage(pid, process_dag_path);
      }
    }
    const auto high_memory_warning = memory_usage.high_memory_usage_warning();
    const auto high_memory_error = memory_usage.high_memory_usage_error();
    if (memory_usage_value > static_cast<float>(high_memory_error)) {
      const std::string err = absl::StrCat(
          process_dag_path, " has high memory usage: ", memory_usage_value,
          " > ", high_memory_error);
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (memory_usage_value > static_cast<float>(high_memory_warning)) {
      const std::string warn = absl::StrCat(
          process_dag_path, " has high memory usage: ", memory_usage_value,
          " > ", high_memory_warning);
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

分析

  1. 读取配置文件
  2. 遍历配置,判断process_dag_path是否为空,如果是,就读取整个系统的内存占用,如果不是就读取这个进程的内存占用
  3. 根据设定要的预值来判定是不是需要报警

获取系统的内存占用函数为GetSystemMemoryUsage

float GetSystemMemoryUsage() {
  const std::string system_mem_stat_file = "/proc/meminfo";
  const int mem_total = 0, mem_free = 1, buffers = 3, cached = 4,
            swap_total = 14, swap_free = 15, slab = 21;
  const auto stat_lines = GetStatsLines(system_mem_stat_file, slab + 1);
  if (stat_lines.size() <= slab) {
    AERROR << "failed to load contents from " << system_mem_stat_file;
    return 0.f;
  }
  const auto total_memory =
      GetSystemMemoryValueFromLine(stat_lines[mem_total]) +
      GetSystemMemoryValueFromLine(stat_lines[swap_total]);
  int64_t used_memory = total_memory;
  for (int cur_line = mem_free; cur_line <= slab; ++cur_line) {
    if (cur_line == mem_free || cur_line == buffers || cur_line == cached ||
        cur_line == swap_free || cur_line == slab) {
      used_memory -= GetSystemMemoryValueFromLine(stat_lines[cur_line]);
    }
  }
  return 100.f * (static_cast<float>(used_memory) / total_memory);
}

获取进程的内存占用函数为GetMemoryUsage

float GetMemoryUsage(const int pid, const std::string& process_name) {
  const std::string memory_stat_file = absl::StrCat("/proc/", pid, "/statm");
  const uint32_t page_size_kb = (sysconf(_SC_PAGE_SIZE) >> 10);
  const int resident_idx = 1, gb_2_kb = (1 << 20);
  constexpr static int kMemoryInfo = 0;

  const auto stat_lines = GetStatsLines(memory_stat_file, kMemoryInfo + 1);
  if (stat_lines.size() <= kMemoryInfo) {
    AERROR << "failed to load contents from " << memory_stat_file;
    return 0.f;
  }
  const std::vector<std::string> stats =
      absl::StrSplit(stat_lines[kMemoryInfo], ' ', absl::SkipWhitespace());
  if (stats.size() <= resident_idx) {
    AERROR << "failed to get memory info for process " << process_name;
    return 0.f;
  }
  return static_cast<float>(std::stoll(stats[resident_idx]) * page_size_kb) /
         gb_2_kb;
}

都是读linux 下/proc 下的文件得到的。

检查磁盘负载

代码

void ResourceMonitor::CheckDiskLoads(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& disk_load : config.disk_load_usages()) {
    const auto disk_load_value = GetSystemDiskload(disk_load.device_name());
    const auto high_disk_load_warning = disk_load.high_disk_load_warning();
    const auto high_disk_load_error = disk_load.high_disk_load_error();
    if (disk_load_value > static_cast<float>(high_disk_load_error)) {
      const std::string err = absl::StrCat(
          disk_load.device_name(), " has high disk load: ", disk_load_value,
          " > ", high_disk_load_error);
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (disk_load_value > static_cast<float>(high_disk_load_warning)) {
      const std::string warn = absl::StrCat(
          disk_load.device_name(), " has high disk load: ", disk_load_value,
          " > ", high_disk_load_warning);
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

分析

  1. 分析配置文件,找到磁盘负载配置
  2. 遍历配置,获取磁盘的实际负载,并根据设置的阈值进行故障判断

获取磁盘负载:GetSystemDiskload

float GetSystemDiskload(const std::string& device_name) {
  const std::string disks_stat_file = "/proc/diskstats";
  const int device = 2, in_out_ms = 12;
  const int seconds_to_ms = 1000;
  constexpr static int kDiskInfo = 128;
  static uint64_t prev_disk_stats = 0;

  const auto stat_lines = GetStatsLines(disks_stat_file, kDiskInfo);
  uint64_t disk_stats = 0;
  for (const auto& line : stat_lines) {
    const std::vector<std::string> stats =
        absl::StrSplit(line, ' ', absl::SkipWhitespace());
    if (stats[device] == device_name) {
      disk_stats = std::stoll(stats[in_out_ms]);
      break;
    }
  }
  const uint64_t tmp_prev_disk_stats = prev_disk_stats;
  prev_disk_stats = disk_stats;
  if (tmp_prev_disk_stats == 0) {
    return 0.f;
  }
  return 100.f *
         (static_cast<float>(disk_stats - tmp_prev_disk_stats) /
          static_cast<float>(FLAGS_resource_monitor_interval * seconds_to_ms));
}

猜你喜欢

转载自blog.csdn.net/qq_32378713/article/details/128057369