目录
SystemStatus & ComponentStatus
获取系统的内存占用函数为GetSystemMemoryUsage
- class ResourceMonitor : public RecurrentRunner {
- public:
- ResourceMonitor();
- void RunOnce(const double current_time) override;
-
- private:
- static void UpdateStatus(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status);
- static void CheckDiskSpace(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status);
- static void CheckCPUUsage(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status);
- static void CheckMemoryUsage(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status);
- static void CheckDiskLoads(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status);
- };
- void ResourceMonitor::RunOnce(const double current_time) {
- auto manager = MonitorManager::Instance();
- const auto& mode = manager->GetHMIMode();
- auto* components = manager->GetStatus()->mutable_components();
-
- for (const auto& iter : mode.monitored_components()) {
- const std::string& name = iter.first;
- const auto& config = iter.second;
- if (config.has_resource()) {
- UpdateStatus(config.resource(),
- components->at(name).mutable_resource_status());
- }
- }
- }
这里要认识一些proto ,不然后面的类分析会看不懂。
- message ComponentStatus {
- enum Status {
- UNKNOWN = 0;
- OK = 1;
- WARN = 2;
- ERROR = 3;
- FATAL = 4;
- }
- optional Status status = 1 [default = UNKNOWN];
- optional string message = 2;
- }
-
- message Component {
- // A summary of all detailed status.
- optional ComponentStatus summary = 1;
-
- // Detailed status.
- optional ComponentStatus process_status = 2;
- optional ComponentStatus channel_status = 3;
- optional ComponentStatus resource_status = 4;
- optional ComponentStatus other_status = 5;
- optional ComponentStatus module_status = 6;
- }
-
- message SystemStatus {
- optional apollo.common.Header header = 1;
-
- map
hmi_modules = 7; - map
components = 8; -
- // Some critical message for passengers. HMI should highlight it or even read
- // loudly.
- optional string passenger_msg = 4;
-
- // If we have this field, safety_mode should be triggered.
- // We'll check the system action and driver action continuously. If no proper
- // action was taken in a specified period of time (such as 10 seconds), EStop
- // will be sent to bring the vehicle into emergency full stop.
- optional double safety_mode_trigger_time = 5;
- optional bool require_emergency_stop = 6;
-
- // In simulation mode, the monitor will publish message with this field set,
- // so subscribers could identify it from the recorded messages.
- optional bool is_realtime_in_simulation = 9;
-
- // In some modes, other processes besides modules and monitored components may
- // need to be monitored
- map
other_components = 10; -
- reserved 2, 3;
- }
- void ResourceMonitor::CheckDiskSpace(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status) {
- // Monitor available disk space.
- for (const auto& disk_space : config.disk_spaces()) {
- for (const auto& path : cyber::common::Glob(disk_space.path())) {
- const auto space = boost::filesystem::space(path);
- const int available_gb = static_cast<int>(space.available >> 30);
- if (available_gb < disk_space.insufficient_space_error()) {
- const std::string err =
- absl::StrCat(path, " has insufficient space: ", available_gb,
- "GB < ", disk_space.insufficient_space_error());
- SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
- } else if (available_gb < disk_space.insufficient_space_warning()) {
- const std::string err =
- absl::StrCat(path, " has insufficient space: ", available_gb,
- "GB < ", disk_space.insufficient_space_warning());
- SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
- }
- }
- }
- }
首先需要搞明白参数是什么,config参数 是从manager->GetHMIMode() 中的mode 得到的。
mode 是从下面这个函数得到的。
- HMIConfig HMIWorker::LoadConfig() {
- HMIConfig config;
- // Get available modes, maps and vehicles by listing data directory.
- *config.mutable_modes() =
- ListFilesAsDict(FLAGS_hmi_modes_config_path, ".pb.txt");
- ACHECK(!config.modes().empty())
- << "No modes config loaded from " << FLAGS_hmi_modes_config_path;
-
- *config.mutable_maps() = ListDirAsDict(FLAGS_maps_data_path);
- *config.mutable_vehicles() = ListDirAsDict(FLAGS_vehicles_config_path);
- AINFO << "Loaded HMI config: " << config.DebugString();
- return config;
- }
FLAGS_hmi_modes_config_path 可以search 到,定义在hmi_work.cc中
DEFINE_string(hmi_modes_config_path, "/apollo/modules/dreamview/conf/hmi_modes",
"HMI modes config path.");
可以知道/apollo/modules/dreamview/conf/hmi_modes下有一些后缀是.pb.txt 定义了config。
打开对应目录,下面有下述文件
- ├── camera-Lidar_sensor_calibration.pb.txt
- ├── car_teleop.pb.txt
- ├── console_teleop.pb.txt
- ├── dev_kit_close_loop.pb.txt
- ├── dev_kit_debug.pb.txt
- ├── ipc1_mkz_close_loop.pb.txt
- ├── ipc1_mkz_standard_debug.pb.txt
- ├── ipc2_mkz_close_loop.pb.txt
- ├── ipc2_mkz_standard_debug.pb.txt
- ├── lidar-IMU_sensor_calibration.pb.txt
- ├── map_collection.pb.txt
- ├── mkz_64beam.pb.txt
- ├── mkz_close_loop.pb.txt
- ├── mkz_lgsvl.pb.txt
- ├── mkz_standard_debug.pb.txt
- ├── mkz_standard_debug_hesai.pb.txt
- ├── mkz_standard_debug_smart_recorder.pb.txt
- ├── navigation.pb.txt
- ├── rtk.pb.txt
- └── vehicle_calibration.pb.txt
然后把下面文件中的内容,读入到HMIMode 这个结构中
- message HMIMode {
- map
cyber_modules = 1; - map
modules = 2; - map
monitored_components = 3; - map
other_components = 4; - }
这些文件中都定义了对于监控目标的预值,比如:diskspace
- monitored_components {
- key: "Recorder"
- value: {
- process {
- command_keywords: "cyber_recorder record"
- }
- resource {
- disk_spaces {
- # For logs.
- path: "/apollo/data"
- insufficient_space_warning: 8
- insufficient_space_error: 2
- }
- disk_spaces {
- # For records.
- path: "/media/apollo/internal_nvme"
- insufficient_space_warning: 128
- insufficient_space_error: 32
- }
- }
- }
- }
会定义出文件的存储位置,以及对应的报错预值。
再来看我们的函数主体,就知道
- for (const auto& disk_space : config.disk_spaces()) {
- for (const auto& path : cyber::common::Glob(disk_space.path())) {
- const auto space = boost::filesystem::space(path);
这段代码就是遍历每一个monitor config 中配置的磁盘位置,以及空间的大小。
-
- if (available_gb < disk_space.insufficient_space_error()) {
- const std::string err =
- absl::StrCat(path, " has insufficient space: ", available_gb,
- "GB < ", disk_space.insufficient_space_error());
- SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
- } else if (available_gb < disk_space.insufficient_space_warning()) {
- const std::string err =
- absl::StrCat(path, " has insufficient space: ", available_gb,
- "GB < ", disk_space.insufficient_space_warning());
- SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
- }
这一段就是通过设置的预值来报警。
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
这一段就是用当前最高级的故障来表示磁盘的状态。
SummaryMonitor 属于software monitor 的部分,在software monitor中介绍。
- void ResourceMonitor::CheckCPUUsage(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status) {
- for (const auto& cpu_usage : config.cpu_usages()) {
- const auto process_dag_path = cpu_usage.process_dag_path();
- float cpu_usage_value = 0.f;
- if (process_dag_path.empty()) {
- cpu_usage_value = GetSystemCPUUsage();
- } else {
- int pid = 0;
- if (GetPIDByCmdLine(process_dag_path, &pid)) {
- static std::unordered_map
uint64_t> prev_jiffies_map; - if (prev_jiffies_map.find(process_dag_path) == prev_jiffies_map.end()) {
- prev_jiffies_map[process_dag_path] = 0;
- }
- cpu_usage_value = GetCPUUsage(pid, process_dag_path, &prev_jiffies_map);
- }
- }
- const auto high_cpu_warning = cpu_usage.high_cpu_usage_warning();
- const auto high_cpu_error = cpu_usage.high_cpu_usage_error();
- if (cpu_usage_value > high_cpu_error) {
- const std::string err = absl::StrCat(
- process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
- high_cpu_error, "%");
- SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
- } else if (cpu_usage_value > high_cpu_warning) {
- const std::string warn = absl::StrCat(
- process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
- high_cpu_warning, "%");
- SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
- }
- }
- }
核心逻辑是获取CPU占用的部分和上面将的磁盘空间分析一样。
读取配置,遍历配置,然后根据CPU的占用,给出对应的级别的报警
具体的获取方法是GetSystemCPUUsage,本质上读取/proc/stat 文件,然后通过解析文件获取到的。
- float GetSystemCPUUsage() {
- const std::string system_cpu_stat_file = "/proc/stat";
- const int users = 1, system = 3, total = 7;
- constexpr static int kSystemCpuInfo = 0;
- static uint64_t prev_jiffies = 0, prev_work_jiffies = 0;
- const auto stat_lines =
- GetStatsLines(system_cpu_stat_file, kSystemCpuInfo + 1);
- if (stat_lines.size() <= kSystemCpuInfo) {
- AERROR << "failed to load contents from " << system_cpu_stat_file;
- return 0.f;
- }
- const std::vector
jiffies_stats = - absl::StrSplit(stat_lines[kSystemCpuInfo], ' ', absl::SkipWhitespace());
- if (jiffies_stats.size() <= total) {
- AERROR << "failed to get system CPU info from " << system_cpu_stat_file;
- return 0.f;
- }
- uint64_t jiffies = 0, work_jiffies = 0;
- for (int cur_stat = users; cur_stat <= total; ++cur_stat) {
- const auto cur_stat_value = std::stoll(jiffies_stats[cur_stat]);
- jiffies += cur_stat_value;
- if (cur_stat <= system) {
- work_jiffies += cur_stat_value;
- }
- }
- const uint64_t tmp_prev_jiffies = prev_jiffies;
- const uint64_t tmp_prev_work_jiffies = prev_work_jiffies;
- prev_jiffies = jiffies;
- prev_work_jiffies = work_jiffies;
- if (tmp_prev_jiffies == 0) {
- return 0.f;
- }
- return 100.f * (static_cast<float>(work_jiffies - tmp_prev_work_jiffies) /
- (jiffies - tmp_prev_jiffies));
- }
- void ResourceMonitor::CheckMemoryUsage(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status) {
- for (const auto& memory_usage : config.memory_usages()) {
- const auto process_dag_path = memory_usage.process_dag_path();
- float memory_usage_value = 0.f;
- if (process_dag_path.empty()) {
- memory_usage_value = GetSystemMemoryUsage();
- } else {
- int pid = 0;
- if (GetPIDByCmdLine(process_dag_path, &pid)) {
- memory_usage_value = GetMemoryUsage(pid, process_dag_path);
- }
- }
- const auto high_memory_warning = memory_usage.high_memory_usage_warning();
- const auto high_memory_error = memory_usage.high_memory_usage_error();
- if (memory_usage_value > static_cast<float>(high_memory_error)) {
- const std::string err = absl::StrCat(
- process_dag_path, " has high memory usage: ", memory_usage_value,
- " > ", high_memory_error);
- SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
- } else if (memory_usage_value > static_cast<float>(high_memory_warning)) {
- const std::string warn = absl::StrCat(
- process_dag_path, " has high memory usage: ", memory_usage_value,
- " > ", high_memory_warning);
- SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
- }
- }
- }
- float GetSystemMemoryUsage() {
- const std::string system_mem_stat_file = "/proc/meminfo";
- const int mem_total = 0, mem_free = 1, buffers = 3, cached = 4,
- swap_total = 14, swap_free = 15, slab = 21;
- const auto stat_lines = GetStatsLines(system_mem_stat_file, slab + 1);
- if (stat_lines.size() <= slab) {
- AERROR << "failed to load contents from " << system_mem_stat_file;
- return 0.f;
- }
- const auto total_memory =
- GetSystemMemoryValueFromLine(stat_lines[mem_total]) +
- GetSystemMemoryValueFromLine(stat_lines[swap_total]);
- int64_t used_memory = total_memory;
- for (int cur_line = mem_free; cur_line <= slab; ++cur_line) {
- if (cur_line == mem_free || cur_line == buffers || cur_line == cached ||
- cur_line == swap_free || cur_line == slab) {
- used_memory -= GetSystemMemoryValueFromLine(stat_lines[cur_line]);
- }
- }
- return 100.f * (static_cast<float>(used_memory) / total_memory);
- }
- float GetMemoryUsage(const int pid, const std::string& process_name) {
- const std::string memory_stat_file = absl::StrCat("/proc/", pid, "/statm");
- const uint32_t page_size_kb = (sysconf(_SC_PAGE_SIZE) >> 10);
- const int resident_idx = 1, gb_2_kb = (1 << 20);
- constexpr static int kMemoryInfo = 0;
-
- const auto stat_lines = GetStatsLines(memory_stat_file, kMemoryInfo + 1);
- if (stat_lines.size() <= kMemoryInfo) {
- AERROR << "failed to load contents from " << memory_stat_file;
- return 0.f;
- }
- const std::vector
stats = - absl::StrSplit(stat_lines[kMemoryInfo], ' ', absl::SkipWhitespace());
- if (stats.size() <= resident_idx) {
- AERROR << "failed to get memory info for process " << process_name;
- return 0.f;
- }
- return static_cast<float>(std::stoll(stats[resident_idx]) * page_size_kb) /
- gb_2_kb;
- }
都是读linux 下/proc 下的文件得到的。
- void ResourceMonitor::CheckDiskLoads(
- const apollo::dreamview::ResourceMonitorConfig& config,
- ComponentStatus* status) {
- for (const auto& disk_load : config.disk_load_usages()) {
- const auto disk_load_value = GetSystemDiskload(disk_load.device_name());
- const auto high_disk_load_warning = disk_load.high_disk_load_warning();
- const auto high_disk_load_error = disk_load.high_disk_load_error();
- if (disk_load_value > static_cast<float>(high_disk_load_error)) {
- const std::string err = absl::StrCat(
- disk_load.device_name(), " has high disk load: ", disk_load_value,
- " > ", high_disk_load_error);
- SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
- } else if (disk_load_value > static_cast<float>(high_disk_load_warning)) {
- const std::string warn = absl::StrCat(
- disk_load.device_name(), " has high disk load: ", disk_load_value,
- " > ", high_disk_load_warning);
- SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
- }
- }
- }
- float GetSystemDiskload(const std::string& device_name) {
- const std::string disks_stat_file = "/proc/diskstats";
- const int device = 2, in_out_ms = 12;
- const int seconds_to_ms = 1000;
- constexpr static int kDiskInfo = 128;
- static uint64_t prev_disk_stats = 0;
-
- const auto stat_lines = GetStatsLines(disks_stat_file, kDiskInfo);
- uint64_t disk_stats = 0;
- for (const auto& line : stat_lines) {
- const std::vector
stats = - absl::StrSplit(line, ' ', absl::SkipWhitespace());
- if (stats[device] == device_name) {
- disk_stats = std::stoll(stats[in_out_ms]);
- break;
- }
- }
- const uint64_t tmp_prev_disk_stats = prev_disk_stats;
- prev_disk_stats = disk_stats;
- if (tmp_prev_disk_stats == 0) {
- return 0.f;
- }
- return 100.f *
- (static_cast<float>(disk_stats - tmp_prev_disk_stats) /
- static_cast<float>(FLAGS_resource_monitor_interval * seconds_to_ms));
- }