目录
定位监控
代码
分析
备注
camera监控
代码
分析
功能安全监控
代码
分析
CheckSafety函数分析
RunOnce 函数分析
记录功能监控
代码
分析
SmartRecorderStatus proto
状态的上报位置分析
监控信息汇总服务
代码
分析
class LocalizationMonitor : public RecurrentRunner {public:LocalizationMonitor();void RunOnce(const double current_time) override;
};void LocalizationMonitor::RunOnce(const double current_time) {auto manager = MonitorManager::Instance();auto* component = apollo::common::util::FindOrNull(*manager->GetStatus()->mutable_components(),FLAGS_localization_component_name);if (component == nullptr) {// localization is not monitored in current mode, skip.return;}static auto reader =manager->CreateReader(FLAGS_localization_msf_status);reader->Observe();const auto status = reader->GetLatestObserved();ComponentStatus* component_status = component->mutable_other_status();component_status->clear_status();if (status == nullptr) {SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,"No LocalizationStatus received",component_status);return;}// Translate LocalizationStatus to ComponentStatus. Note that ERROR and FATAL// will trigger safety mode in current settings.switch (status->fusion_status()) {case MeasureState::OK:SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);break;case MeasureState::WARNNING:SummaryMonitor::EscalateStatus(ComponentStatus::WARN,absl::StrCat("WARNNING: ", status->state_message()),component_status);break;case MeasureState::ERROR:SummaryMonitor::EscalateStatus(ComponentStatus::WARN,absl::StrCat("ERROR: ", status->state_message()), component_status);break;case MeasureState::CRITICAL_ERROR:SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,absl::StrCat("CRITICAL_ERROR: ", status->state_message()),component_status);break;case MeasureState::FATAL_ERROR:SummaryMonitor::EscalateStatus(ComponentStatus::FATAL,absl::StrCat("FATAL_ERROR: ", status->state_message()),component_status);break;default:AFATAL << "Unknown fusion_status: " << status->fusion_status();break;}
}
## Check MSF Localization Status

We provide a simple way to check the lidar localization, GNSS localization and fusion localization status. There are four states for localization status: {NOT_VALID, NOT_STABLE, OK, VALID}. You can simply run `rostopic echo /apollo/localization/msf_status` to check the localization status. If fusion_status is VALID or OK, the output of MSF localization is reliable.
上述是apollo MSF 定位状态的判断逻辑,上述故障都是由业务模块定位部分设置并发出的。
下面是modules/localization/rtk/rtk_localization.cc的状态检测部分
void RTKLocalization::FillLocalizationStatusMsg(const drivers::gnss::InsStat &status,LocalizationStatus *localization_status) {apollo::common::Header *header = localization_status->mutable_header();double timestamp = apollo::cyber::Clock::NowInSeconds();header->set_timestamp_sec(timestamp);localization_status->set_measurement_time(status.header().timestamp_sec());if (!status.has_pos_type()) {localization_status->set_fusion_status(MeasureState::ERROR);localization_status->set_state_message("Error: Current Localization Status Is Missing.");return;}
// Recurrent monitor for the camera component.
class CameraMonitor : public RecurrentRunner {
 public:
  CameraMonitor();

  // Runs one monitoring pass at `current_time`.
  void RunOnce(const double current_time) override;

 private:
  // Writes the camera check result into `status`.
  static void UpdateStatus(ComponentStatus* status);
};
void CameraMonitor::RunOnce(const double current_time) {auto* manager = MonitorManager::Instance();auto* component = apollo::common::util::FindOrNull(*manager->GetStatus()->mutable_components(), FLAGS_camera_component_name);if (component == nullptr) {// camera is not monitored in current mode, skip.return;}auto* status = component->mutable_other_status();UpdateStatus(status);
}
除了判断camera是不是被配置为监控配置之外核心函数在UpdateStatus 中
// Rebuilds `status` from the camera topics in camera_topic_set.
//
// For each topic that yields both a reader and a latest message:
//   - the first one seen supplies the frame id (if its header carries one);
//   - any further one, once a frame id is recorded, escalates to ERROR
//     ("Only one camera is permitted").
// Afterwards an empty frame id escalates to ERROR, otherwise OK is applied
// together with the detected frame id.
void CameraMonitor::UpdateStatus(ComponentStatus* status) {
  status->clear_status();

  std::string detected_frame_id;
  for (const auto& camera_topic : camera_topic_set) {
    const auto& probe = CreateReaderAndLatestsMessage(camera_topic);
    const auto& camera_reader = probe.first;
    const auto& latest_message = probe.second;

    // Topics without a reader or without any message are ignored.
    if (camera_reader == nullptr || latest_message == nullptr) {
      continue;
    }

    // A frame id is already recorded, so this is an additional live camera.
    if (!detected_frame_id.empty()) {
      SummaryMonitor::EscalateStatus(
          ComponentStatus::ERROR,
          absl::StrCat("Only one camera is permitted"), status);
      continue;
    }

    const auto& header = latest_message->header();
    if (header.has_frame_id()) {
      detected_frame_id = header.frame_id();
    }
  }

  if (!detected_frame_id.empty()) {
    SummaryMonitor::EscalateStatus(
        ComponentStatus::OK,
        absl::StrCat("Detected one camera: ", detected_frame_id), status);
    return;
  }
  SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,
                                 absl::StrCat("No camera is detected"), status);
}
// Set of camera image topics that CameraMonitor::UpdateStatus iterates over.
static const auto camera_topic_set = std::set{
    FLAGS_image_long_topic,
    FLAGS_camera_image_long_topic,
    FLAGS_camera_image_short_topic,
    FLAGS_camera_front_6mm_topic,
    FLAGS_camera_front_6mm_2_topic,
    FLAGS_camera_front_12mm_topic,
    // Add more cameras here if you want to monitor.
};
absl::StrCat("Only one camera is permitted"), status);
如果frame id 是 empty,就报ERROR
ComponentStatus::ERROR, absl::StrCat("No camera is detected"), status);
// Check if we need to switch to safe mode, and then
// 1. Notify driver to take action.
// 2. Trigger Guardian if no proper action was taken.
class FunctionalSafetyMonitor : public RecurrentRunner {public:FunctionalSafetyMonitor();void RunOnce(const double current_time);private:bool CheckSafety();
};
void FunctionalSafetyMonitor::RunOnce(const double current_time) {auto* system_status = MonitorManager::Instance()->GetStatus();// Everything looks good or has been handled properly.if (CheckSafety()) {system_status->clear_passenger_msg();system_status->clear_safety_mode_trigger_time();system_status->clear_require_emergency_stop();return;}if (system_status->require_emergency_stop()) {// EStop has already been triggered.return;}// Newly entered safety mode.system_status->set_passenger_msg("Error! Please disengage.");if (!system_status->has_safety_mode_trigger_time()) {system_status->set_safety_mode_trigger_time(current_time);return;}// Trigger EStop if no action was taken in time.if (system_status->safety_mode_trigger_time() +FLAGS_safety_mode_seconds_before_estop set_require_emergency_stop(true);}
}
bool FunctionalSafetyMonitor::CheckSafety() {// We only check safety in self driving mode.auto manager = MonitorManager::Instance();if (!manager->IsInAutonomousMode()) {return true;}// Check HMI modules status.const auto& mode = manager->GetHMIMode();const auto& hmi_modules = manager->GetStatus()->hmi_modules();for (const auto& iter : mode.modules()) {const std::string& module_name = iter.first;const auto& module = iter.second;if (module.required_for_safety() &&!IsSafe(module_name, hmi_modules.at(module_name))) {return false;}}// Check monitored components status.const auto& components = manager->GetStatus()->components();for (const auto& iter : mode.monitored_components()) {const std::string& component_name = iter.first;const auto& component = iter.second;if (component.required_for_safety() &&!IsSafe(component_name, components.at(component_name).summary())) {return false;}}// Everything looks good.return true;
}
recorder monitor 是 apollo 对记录服务的监控,方法是通过订阅 /apollo/data/recorder/status 这个 topic 获取 Recorder status。
class RecorderMonitor : public RecurrentRunner {public:RecorderMonitor();void RunOnce(const double current_time) override;
};void RecorderMonitor::RunOnce(const double current_time) {auto manager = MonitorManager::Instance();auto* component = apollo::common::util::FindOrNull(*manager->GetStatus()->mutable_components(),FLAGS_smart_recorder_component_name);if (component == nullptr) {// SmartRecorder is not monitored in current mode, skip.return;}static auto reader =manager->CreateReader(FLAGS_recorder_status_topic);reader->Observe();const auto status = reader->GetLatestObserved();ComponentStatus* component_status = component->mutable_other_status();component_status->clear_status();if (status == nullptr) {SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,"No SmartRecorderStatus received",component_status);return;}// Translate SmartRecorderStatus to ComponentStatus. Note that ERROR and FATAL// will trigger safety mode in current settings.switch (status->recording_state()) {case RecordingState::RECORDING:SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);break;case RecordingState::TERMINATING:SummaryMonitor::EscalateStatus(ComponentStatus::WARN,absl::StrCat("WARNNING: ", status->state_message()),component_status);break;case RecordingState::STOPPED:SummaryMonitor::EscalateStatus(ComponentStatus::OK,absl::StrCat("STOPPED: ", status->state_message()), component_status);break;default:AFATAL << "Unknown recording status: " << status->recording_state();break;}
}
第一步依旧是判断recorder 是不是被配置的监控模块,如果不是直接返回。
然后就是直接判断status->recording_state(),如果是RecordingState::TERMINATING(终止)状态就报出一个WARNING 的故障
// Recording states carried in SmartRecorderStatus.
enum RecordingState {
  STOPPED = 0;
  RECORDING = 1;
  TERMINATING = 2;
}

// Status message of the smart recorder: a header, the current recording
// state and a free-form state message.
message SmartRecorderStatus {
  optional apollo.common.Header header = 1;
  optional RecordingState recording_state = 2;
  optional string state_message = 3;
}
modules/data/tools/smart_recorder/realtime_record_processor.cc
我们可以在上述文件中找到recorder状态赋值情况,但是可惜apollo 中目前没有一个模块会主动填写RecordingState::TERMINATING(终止)状态。
// A monitor which summarize other monitors' result and publish the whole status
// if it has changed.
class SummaryMonitor : public RecurrentRunner {public:SummaryMonitor();void RunOnce(const double current_time) override;// Escalate the status to a higher priority new status:// FATAL > ERROR > WARN > OK > UNKNOWN.static void EscalateStatus(const ComponentStatus::Status new_status,const std::string& message,ComponentStatus* current_status);private:size_t system_status_fp_ = 0;double last_broadcast_ = 0;
};void SummaryMonitor::RunOnce(const double current_time) {auto manager = MonitorManager::Instance();auto* status = manager->GetStatus();// Escalate the summary status to the most severe one.for (auto& component : *status->mutable_components()) {auto* summary = component.second.mutable_summary();const auto& process_status = component.second.process_status();EscalateStatus(process_status.status(), process_status.message(), summary);const auto& module_status = component.second.module_status();EscalateStatus(module_status.status(), module_status.message(), summary);const auto& channel_status = component.second.channel_status();EscalateStatus(channel_status.status(), channel_status.message(), summary);const auto& resource_status = component.second.resource_status();EscalateStatus(resource_status.status(), resource_status.message(),summary);const auto& other_status = component.second.other_status();EscalateStatus(other_status.status(), other_status.message(), summary);}// Get fingerprint of current status.// Don't use DebugString() which has known bug on Map field. The string// doesn't change though the value has changed.static std::hash hash_fn;std::string proto_bytes;status->SerializeToString(&proto_bytes);const size_t new_fp = hash_fn(proto_bytes);if (system_status_fp_ != new_fp ||current_time - last_broadcast_ > FLAGS_system_status_publish_interval) {static auto writer =manager->CreateWriter(FLAGS_system_status_topic);apollo::common::util::FillHeader("SystemMonitor", status);writer->Write(*status);status->clear_header();system_status_fp_ = new_fp;last_broadcast_ = current_time;}
}
针对前面所有的monitor 上报的故障信息,进行一个整合,然后发送到/apollo/monitor/system_status这个topic 上。