• Apollo 应用与源码分析:Monitor监控-硬件监控-CPU、磁盘、内存资源监控逻辑源码分析


    目录

    结构分析

    代码

    分析

    执行主体分析

    代码

    分析

    SystemStatus & ComponentStatus

    检查磁盘空间分析

    代码

    分析

    检查CPU 占用

    代码

    分析

    检查内存占用

    代码

    分析

    获取系统的内存占用函数为GetSystemMemoryUsage

    获取进程的内存占用函数为GetMemoryUsage

    检查磁盘负载

    代码

    分析

    获取磁盘负载:GetSystemDiskload


    结构分析

    代码

    1. class ResourceMonitor : public RecurrentRunner {
    2. public:
    3. ResourceMonitor();
    4. void RunOnce(const double current_time) override;
    5. private:
    6. static void UpdateStatus(
    7. const apollo::dreamview::ResourceMonitorConfig& config,
    8. ComponentStatus* status);
    9. static void CheckDiskSpace(
    10. const apollo::dreamview::ResourceMonitorConfig& config,
    11. ComponentStatus* status);
    12. static void CheckCPUUsage(
    13. const apollo::dreamview::ResourceMonitorConfig& config,
    14. ComponentStatus* status);
    15. static void CheckMemoryUsage(
    16. const apollo::dreamview::ResourceMonitorConfig& config,
    17. ComponentStatus* status);
    18. static void CheckDiskLoads(
    19. const apollo::dreamview::ResourceMonitorConfig& config,
    20. ComponentStatus* status);
    21. };

    分析

    1. 继承了RecurrentRunner,重写了RunOnce,说明主要的工作任务就是在RunOnce里面
    2. 有4个静态函数分别检查磁盘空间、CPU使用、内存使用、磁盘负载。
    3. 有一个更新状态的函数UpdateStatus。

    执行主体分析

    代码

    1. void ResourceMonitor::RunOnce(const double current_time) {
    2. auto manager = MonitorManager::Instance();
    3. const auto& mode = manager->GetHMIMode();
    4. auto* components = manager->GetStatus()->mutable_components();
    5. for (const auto& iter : mode.monitored_components()) {
    6. const std::string& name = iter.first;
    7. const auto& config = iter.second;
    8. if (config.has_resource()) {
    9. UpdateStatus(config.resource(),
    10. components->at(name).mutable_resource_status());
    11. }
    12. }
    13. }

    分析

    1. 通过MonitorManager::Instance() 获取MonitorManager 的单例;
    2. 通过这个实例获取当前的HMI模式(HMIMode)配置,得到其中配置的要监控的component;
    3. 遍历要监控的component,对其中配置了resource 的component 调用UpdateStatus 更新其resource_status。

    这里要认识一些proto ,不然后面的类分析会看不懂。

    SystemStatus & ComponentStatus

    1. message ComponentStatus {
    2. enum Status {
    3. UNKNOWN = 0;
    4. OK = 1;
    5. WARN = 2;
    6. ERROR = 3;
    7. FATAL = 4;
    8. }
    9. optional Status status = 1 [default = UNKNOWN];
    10. optional string message = 2;
    11. }
    12. message Component {
    13. // A summary of all detailed status.
    14. optional ComponentStatus summary = 1;
    15. // Detailed status.
    16. optional ComponentStatus process_status = 2;
    17. optional ComponentStatus channel_status = 3;
    18. optional ComponentStatus resource_status = 4;
    19. optional ComponentStatus other_status = 5;
    20. optional ComponentStatus module_status = 6;
    21. }
    22. message SystemStatus {
    23. optional apollo.common.Header header = 1;
    24. map<string, ComponentStatus> hmi_modules = 7;
    25. map<string, Component> components = 8;
    26. // Some critical message for passengers. HMI should highlight it or even read
    27. // loudly.
    28. optional string passenger_msg = 4;
    29. // If we have this field, safety_mode should be triggered.
    30. // We'll check the system action and driver action continuously. If no proper
    31. // action was taken in a specified period of time (such as 10 seconds), EStop
    32. // will be sent to bring the vehicle into emergency full stop.
    33. optional double safety_mode_trigger_time = 5;
    34. optional bool require_emergency_stop = 6;
    35. // In simulation mode, the monitor will publish message with this field set,
    36. // so subscribers could identify it from the recorded messages.
    37. optional bool is_realtime_in_simulation = 9;
    38. // In some modes, other processes besides modules and monitored components may
    39. // need to be monitored
    40. map<string, ComponentStatus> other_components = 10;
    41. reserved 2, 3;
    42. }

    检查磁盘空间分析

    代码

    1. void ResourceMonitor::CheckDiskSpace(
    2. const apollo::dreamview::ResourceMonitorConfig& config,
    3. ComponentStatus* status) {
    4. // Monitor available disk space.
    5. for (const auto& disk_space : config.disk_spaces()) {
    6. for (const auto& path : cyber::common::Glob(disk_space.path())) {
    7. const auto space = boost::filesystem::space(path);
    8. const int available_gb = static_cast<int>(space.available >> 30);
    9. if (available_gb < disk_space.insufficient_space_error()) {
    10. const std::string err =
    11. absl::StrCat(path, " has insufficient space: ", available_gb,
    12. "GB < ", disk_space.insufficient_space_error());
    13. SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    14. } else if (available_gb < disk_space.insufficient_space_warning()) {
    15. const std::string err =
    16. absl::StrCat(path, " has insufficient space: ", available_gb,
    17. "GB < ", disk_space.insufficient_space_warning());
    18. SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
    19. }
    20. }
    21. }
    22. }

    分析

    首先需要搞明白参数是什么,config参数 是从manager->GetHMIMode() 中的mode 得到的。

    mode 是从下面这个函数得到的。

    1. HMIConfig HMIWorker::LoadConfig() {
    2. HMIConfig config;
    3. // Get available modes, maps and vehicles by listing data directory.
    4. *config.mutable_modes() =
    5. ListFilesAsDict(FLAGS_hmi_modes_config_path, ".pb.txt");
    6. ACHECK(!config.modes().empty())
    7. << "No modes config loaded from " << FLAGS_hmi_modes_config_path;
    8. *config.mutable_maps() = ListDirAsDict(FLAGS_maps_data_path);
    9. *config.mutable_vehicles() = ListDirAsDict(FLAGS_vehicles_config_path);
    10. AINFO << "Loaded HMI config: " << config.DebugString();
    11. return config;
    12. }

    FLAGS_hmi_modes_config_path 可以search 到,定义在hmi_worker.cc中

    DEFINE_string(hmi_modes_config_path, "/apollo/modules/dreamview/conf/hmi_modes",
                  "HMI modes config path.");
    

    由此可知,/apollo/modules/dreamview/conf/hmi_modes 目录下有一些后缀为.pb.txt 的文件,这些文件定义了各个模式的config。

    打开对应目录,下面有下述文件

    1. ├── camera-Lidar_sensor_calibration.pb.txt
    2. ├── car_teleop.pb.txt
    3. ├── console_teleop.pb.txt
    4. ├── dev_kit_close_loop.pb.txt
    5. ├── dev_kit_debug.pb.txt
    6. ├── ipc1_mkz_close_loop.pb.txt
    7. ├── ipc1_mkz_standard_debug.pb.txt
    8. ├── ipc2_mkz_close_loop.pb.txt
    9. ├── ipc2_mkz_standard_debug.pb.txt
    10. ├── lidar-IMU_sensor_calibration.pb.txt
    11. ├── map_collection.pb.txt
    12. ├── mkz_64beam.pb.txt
    13. ├── mkz_close_loop.pb.txt
    14. ├── mkz_lgsvl.pb.txt
    15. ├── mkz_standard_debug.pb.txt
    16. ├── mkz_standard_debug_hesai.pb.txt
    17. ├── mkz_standard_debug_smart_recorder.pb.txt
    18. ├── navigation.pb.txt
    19. ├── rtk.pb.txt
    20. └── vehicle_calibration.pb.txt

    然后把上面这些文件中的内容,读入到HMIMode 这个结构中

    1. message HMIMode {
    2. map<string, CyberModule> cyber_modules = 1;
    3. map<string, Module> modules = 2;
    4. map<string, MonitoredComponent> monitored_components = 3;
    5. map<string, ProcessMonitorConfig> other_components = 4;
    6. }

    这些文件中都定义了对于监控目标的阈值,比如:disk_spaces

    1. monitored_components {
    2. key: "Recorder"
    3. value: {
    4. process {
    5. command_keywords: "cyber_recorder record"
    6. }
    7. resource {
    8. disk_spaces {
    9. # For logs.
    10. path: "/apollo/data"
    11. insufficient_space_warning: 8
    12. insufficient_space_error: 2
    13. }
    14. disk_spaces {
    15. # For records.
    16. path: "/media/apollo/internal_nvme"
    17. insufficient_space_warning: 128
    18. insufficient_space_error: 32
    19. }
    20. }
    21. }
    22. }

    其中会定义出要监控的磁盘路径,以及对应的告警(warning)和报错(error)阈值,单位为GB。

    再来看我们的函数主体,就知道

    1. for (const auto& disk_space : config.disk_spaces()) {
    2. for (const auto& path : cyber::common::Glob(disk_space.path())) {
    3. const auto space = boost::filesystem::space(path);

    这段代码就是遍历monitor config 中配置的每一个磁盘路径(路径先经过Glob 展开),并获取该路径所在磁盘的空间信息。

    1. if (available_gb < disk_space.insufficient_space_error()) {
    2. const std::string err =
    3. absl::StrCat(path, " has insufficient space: ", available_gb,
    4. "GB < ", disk_space.insufficient_space_error());
    5. SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    6. } else if (available_gb < disk_space.insufficient_space_warning()) {
    7. const std::string err =
    8. absl::StrCat(path, " has insufficient space: ", available_gb,
    9. "GB < ", disk_space.insufficient_space_warning());
    10. SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
    11. }

    这一段就是把可用空间(GB)和设置的阈值比较来报警:低于error 阈值报ERROR,否则低于warning 阈值报WARN。

    SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);

    这一段就是用当前最高级的故障来表示磁盘的状态。

    SummaryMonitor 属于software monitor 的部分,在software monitor中介绍。

    检查CPU 占用

    代码

    1. void ResourceMonitor::CheckCPUUsage(
    2. const apollo::dreamview::ResourceMonitorConfig& config,
    3. ComponentStatus* status) {
    4. for (const auto& cpu_usage : config.cpu_usages()) {
    5. const auto process_dag_path = cpu_usage.process_dag_path();
    6. float cpu_usage_value = 0.f;
    7. if (process_dag_path.empty()) {
    8. cpu_usage_value = GetSystemCPUUsage();
    9. } else {
    10. int pid = 0;
    11. if (GetPIDByCmdLine(process_dag_path, &pid)) {
    12. static std::unordered_map<std::string, uint64_t> prev_jiffies_map;
    13. if (prev_jiffies_map.find(process_dag_path) == prev_jiffies_map.end()) {
    14. prev_jiffies_map[process_dag_path] = 0;
    15. }
    16. cpu_usage_value = GetCPUUsage(pid, process_dag_path, &prev_jiffies_map);
    17. }
    18. }
    19. const auto high_cpu_warning = cpu_usage.high_cpu_usage_warning();
    20. const auto high_cpu_error = cpu_usage.high_cpu_usage_error();
    21. if (cpu_usage_value > high_cpu_error) {
    22. const std::string err = absl::StrCat(
    23. process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
    24. high_cpu_error, "%");
    25. SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    26. } else if (cpu_usage_value > high_cpu_warning) {
    27. const std::string warn = absl::StrCat(
    28. process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
    29. high_cpu_warning, "%");
    30. SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    31. }
    32. }
    33. }

    分析

    核心逻辑是获取CPU占用的部分,其余流程和上面讲的磁盘空间分析一样。

    读取配置,遍历配置,然后根据CPU的占用,给出对应的级别的报警

    当process_dag_path 为空时,获取的是整个系统的CPU占用,具体的获取方法是GetSystemCPUUsage,本质上是读取/proc/stat 文件,然后通过解析文件内容计算得到的;否则先通过GetPIDByCmdLine 找到对应进程的pid,再用GetCPUUsage 获取该进程的CPU占用。

    1. float GetSystemCPUUsage() {
    2. const std::string system_cpu_stat_file = "/proc/stat";
    3. const int users = 1, system = 3, total = 7;
    4. constexpr static int kSystemCpuInfo = 0;
    5. static uint64_t prev_jiffies = 0, prev_work_jiffies = 0;
    6. const auto stat_lines =
    7. GetStatsLines(system_cpu_stat_file, kSystemCpuInfo + 1);
    8. if (stat_lines.size() <= kSystemCpuInfo) {
    9. AERROR << "failed to load contents from " << system_cpu_stat_file;
    10. return 0.f;
    11. }
    12. const std::vector<std::string> jiffies_stats =
    13. absl::StrSplit(stat_lines[kSystemCpuInfo], ' ', absl::SkipWhitespace());
    14. if (jiffies_stats.size() <= total) {
    15. AERROR << "failed to get system CPU info from " << system_cpu_stat_file;
    16. return 0.f;
    17. }
    18. uint64_t jiffies = 0, work_jiffies = 0;
    19. for (int cur_stat = users; cur_stat <= total; ++cur_stat) {
    20. const auto cur_stat_value = std::stoll(jiffies_stats[cur_stat]);
    21. jiffies += cur_stat_value;
    22. if (cur_stat <= system) {
    23. work_jiffies += cur_stat_value;
    24. }
    25. }
    26. const uint64_t tmp_prev_jiffies = prev_jiffies;
    27. const uint64_t tmp_prev_work_jiffies = prev_work_jiffies;
    28. prev_jiffies = jiffies;
    29. prev_work_jiffies = work_jiffies;
    30. if (tmp_prev_jiffies == 0) {
    31. return 0.f;
    32. }
    33. return 100.f * (static_cast<float>(work_jiffies - tmp_prev_work_jiffies) /
    34. (jiffies - tmp_prev_jiffies));
    35. }

    检查内存占用

    代码

    1. void ResourceMonitor::CheckMemoryUsage(
    2. const apollo::dreamview::ResourceMonitorConfig& config,
    3. ComponentStatus* status) {
    4. for (const auto& memory_usage : config.memory_usages()) {
    5. const auto process_dag_path = memory_usage.process_dag_path();
    6. float memory_usage_value = 0.f;
    7. if (process_dag_path.empty()) {
    8. memory_usage_value = GetSystemMemoryUsage();
    9. } else {
    10. int pid = 0;
    11. if (GetPIDByCmdLine(process_dag_path, &pid)) {
    12. memory_usage_value = GetMemoryUsage(pid, process_dag_path);
    13. }
    14. }
    15. const auto high_memory_warning = memory_usage.high_memory_usage_warning();
    16. const auto high_memory_error = memory_usage.high_memory_usage_error();
    17. if (memory_usage_value > static_cast<float>(high_memory_error)) {
    18. const std::string err = absl::StrCat(
    19. process_dag_path, " has high memory usage: ", memory_usage_value,
    20. " > ", high_memory_error);
    21. SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    22. } else if (memory_usage_value > static_cast<float>(high_memory_warning)) {
    23. const std::string warn = absl::StrCat(
    24. process_dag_path, " has high memory usage: ", memory_usage_value,
    25. " > ", high_memory_warning);
    26. SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    27. }
    28. }
    29. }

    分析

    1. 读取配置文件
    2. 遍历配置,判断process_dag_path是否为空,如果是,就读取整个系统的内存占用,如果不是就读取这个进程的内存占用
    3. 根据设定的阈值来判定是不是需要报警

    获取系统的内存占用函数为GetSystemMemoryUsage

    1. float GetSystemMemoryUsage() {
    2. const std::string system_mem_stat_file = "/proc/meminfo";
    3. const int mem_total = 0, mem_free = 1, buffers = 3, cached = 4,
    4. swap_total = 14, swap_free = 15, slab = 21;
    5. const auto stat_lines = GetStatsLines(system_mem_stat_file, slab + 1);
    6. if (stat_lines.size() <= slab) {
    7. AERROR << "failed to load contents from " << system_mem_stat_file;
    8. return 0.f;
    9. }
    10. const auto total_memory =
    11. GetSystemMemoryValueFromLine(stat_lines[mem_total]) +
    12. GetSystemMemoryValueFromLine(stat_lines[swap_total]);
    13. int64_t used_memory = total_memory;
    14. for (int cur_line = mem_free; cur_line <= slab; ++cur_line) {
    15. if (cur_line == mem_free || cur_line == buffers || cur_line == cached ||
    16. cur_line == swap_free || cur_line == slab) {
    17. used_memory -= GetSystemMemoryValueFromLine(stat_lines[cur_line]);
    18. }
    19. }
    20. return 100.f * (static_cast<float>(used_memory) / total_memory);
    21. }

    获取进程的内存占用函数为GetMemoryUsage

    1. float GetMemoryUsage(const int pid, const std::string& process_name) {
    2. const std::string memory_stat_file = absl::StrCat("/proc/", pid, "/statm");
    3. const uint32_t page_size_kb = (sysconf(_SC_PAGE_SIZE) >> 10);
    4. const int resident_idx = 1, gb_2_kb = (1 << 20);
    5. constexpr static int kMemoryInfo = 0;
    6. const auto stat_lines = GetStatsLines(memory_stat_file, kMemoryInfo + 1);
    7. if (stat_lines.size() <= kMemoryInfo) {
    8. AERROR << "failed to load contents from " << memory_stat_file;
    9. return 0.f;
    10. }
    11. const std::vector<std::string> stats =
    12. absl::StrSplit(stat_lines[kMemoryInfo], ' ', absl::SkipWhitespace());
    13. if (stats.size() <= resident_idx) {
    14. AERROR << "failed to get memory info for process " << process_name;
    15. return 0.f;
    16. }
    17. return static_cast<float>(std::stoll(stats[resident_idx]) * page_size_kb) /
    18. gb_2_kb;
    19. }

    两者都是通过读取Linux 的/proc 下的文件得到的:系统内存占用读取/proc/meminfo,进程内存占用读取/proc/<pid>/statm。

    检查磁盘负载

    代码

    1. void ResourceMonitor::CheckDiskLoads(
    2. const apollo::dreamview::ResourceMonitorConfig& config,
    3. ComponentStatus* status) {
    4. for (const auto& disk_load : config.disk_load_usages()) {
    5. const auto disk_load_value = GetSystemDiskload(disk_load.device_name());
    6. const auto high_disk_load_warning = disk_load.high_disk_load_warning();
    7. const auto high_disk_load_error = disk_load.high_disk_load_error();
    8. if (disk_load_value > static_cast<float>(high_disk_load_error)) {
    9. const std::string err = absl::StrCat(
    10. disk_load.device_name(), " has high disk load: ", disk_load_value,
    11. " > ", high_disk_load_error);
    12. SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    13. } else if (disk_load_value > static_cast<float>(high_disk_load_warning)) {
    14. const std::string warn = absl::StrCat(
    15. disk_load.device_name(), " has high disk load: ", disk_load_value,
    16. " > ", high_disk_load_warning);
    17. SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    18. }
    19. }
    20. }

    分析

    1. 分析配置文件,找到磁盘负载配置
    2. 遍历配置,获取磁盘的实际负载,并根据设置的阈值进行故障判断

    获取磁盘负载:GetSystemDiskload

    1. float GetSystemDiskload(const std::string& device_name) {
    2. const std::string disks_stat_file = "/proc/diskstats";
    3. const int device = 2, in_out_ms = 12;
    4. const int seconds_to_ms = 1000;
    5. constexpr static int kDiskInfo = 128;
    6. static uint64_t prev_disk_stats = 0;
    7. const auto stat_lines = GetStatsLines(disks_stat_file, kDiskInfo);
    8. uint64_t disk_stats = 0;
    9. for (const auto& line : stat_lines) {
    10. const std::vector<std::string> stats =
    11. absl::StrSplit(line, ' ', absl::SkipWhitespace());
    12. if (stats[device] == device_name) {
    13. disk_stats = std::stoll(stats[in_out_ms]);
    14. break;
    15. }
    16. }
    17. const uint64_t tmp_prev_disk_stats = prev_disk_stats;
    18. prev_disk_stats = disk_stats;
    19. if (tmp_prev_disk_stats == 0) {
    20. return 0.f;
    21. }
    22. return 100.f *
    23. (static_cast<float>(disk_stats - tmp_prev_disk_stats) /
    24. static_cast<float>(FLAGS_resource_monitor_interval * seconds_to_ms));
    25. }

  • 相关阅读:
    Spring 面试 63 问!
    WordPress主题开发( 八)之—— 模板循环详细用法
    Qt多工程同名字段自动翻译工具
    Springboot整合RabbitMQ详解
    中国金刚石工具市场发展现状及供需格局分析预测报告
    Jenkins-jenkins变量
    【Mysql系列】(一)MySQL语句执行流程
    Exploring the Potential of Large Language Models (LLMs) in Learning on Graphs
    单片机——仿真软件Proteus基本使用教程
    认识 https 以及 https的通信流程
  • 原文地址:https://blog.csdn.net/qq_32378713/article/details/128057369