From 7cd464bc6a724b506dea5fd1d35293b45f9a92d3 Mon Sep 17 00:00:00 2001 From: Mock Date: Mon, 13 Apr 2026 21:55:17 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=98=BE=E7=A4=BA=E6=9C=BA=E5=99=A8?= =?UTF-8?q?=E4=BA=BA=E6=95=85=E9=9A=9C=E5=8E=9F=E5=9B=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/monitoring/telemetry.py | 118 +++++++++++++++++++++++ backend/monitoring/video.py | 2 + frontend/src/components/NetworkPanel.vue | 4 + frontend/src/types.ts | 10 ++ 4 files changed, 134 insertions(+) diff --git a/backend/monitoring/telemetry.py b/backend/monitoring/telemetry.py index 09de52a..e29cd72 100644 --- a/backend/monitoring/telemetry.py +++ b/backend/monitoring/telemetry.py @@ -2,10 +2,12 @@ from __future__ import annotations from collections import deque import json +import os import sys import threading import time from datetime import datetime, timezone +from pathlib import Path from typing import Any from .common import ( @@ -21,6 +23,23 @@ from .video import FrameTrailerMetadata, OmniSocketVideoReceiver LOCAL_SAMPLE_INTERVAL_MS = 500 TREND_HISTORY_SIZE = 10 TREND_WINDOW_SIZE = 5 +BLITZ_RUNTIME_DIR = Path(os.getenv("BLITZ_RUNTIME_DIR", "/run/blitz-robot")) +WATCHDOG_STATUS_PATH = BLITZ_RUNTIME_DIR / "watchdog.status.json" +WATCHDOG_STATUS_STALE_MS = max(int(os.getenv("BLITZ_HEALTH_STALE_SEC", "15")), 1) * 1000 +WATCHDOG_FAULT_REASON_MAP: dict[str, tuple[str, str | None]] = { + "": ("none", None), + "none": ("none", None), + "camera_missing": ("video_pipeline_stalled", "degraded"), + "camera_recovered": ("video_session_recovering", "recovering"), + "camera-reappeared-escalated": ("video_session_recovering", "recovering"), + "bside_status_stale": ("video_session_recovering", "recovering"), + "bside-unhealthy-escalated": ("video_session_recovering", "recovering"), + "ros_receiver_unhealthy": ("control_session_recovering", "recovering"), + "ros-unhealthy": ("control_session_recovering", "recovering"), + "network_or_robot_unreachable": ("network_or_robot_unreachable", "recovering"), + "network-recovered-ros-unhealthy": ("control_session_recovering", "recovering"), + "network-recovered-escalated": ("network_or_robot_unreachable", "recovering"), +} def _utc_from_epoch(epoch_seconds: float | None) -> str | None: @@ -47,6 +66,19 @@ def _coerce_float(value: Any, default: float = 0.0) -> float: return default +def _load_optional_json(path: Path) -> dict[str, Any] | None: + try: + if not path.exists(): + return None + with path.open("r", encoding="utf-8") as file: + payload = json.load(file) + if isinstance(payload, dict): + return payload + except Exception: + return None + return None + + class GpsDataService: def __init__(self, receiver: OmniSocketVideoReceiver) -> None: self._receiver = receiver @@ -523,6 +555,80 @@ class NetworkTelemetryService: return session return None + def _derive_robot_health( + self, + *, + video_receiver_status: dict[str, Any], + local_control_registered: bool, + remote_control_fresh: bool, + remote_video_fresh: bool, + telemetry_state: dict[str, Any], + watchdog_status: dict[str, Any] | None, + ) -> dict[str, Any]: + if watchdog_status is not None: + explicit_health = self._derive_robot_health_from_watchdog(watchdog_status) + if explicit_health is not None: + return explicit_health + + has_recent_frame = bool(video_receiver_status.get("has_recent_frame")) + telemetry_connected = bool(telemetry_state.get("connected")) + telemetry_stale = bool(telemetry_state.get("stale", True)) + + if has_recent_frame and remote_control_fresh and remote_video_fresh: + fault_reason = "none" + recovery_state = "ok" + elif not remote_control_fresh and not remote_video_fresh and not has_recent_frame: + fault_reason = "network_or_robot_unreachable" + recovery_state = "recovering" if telemetry_connected and not telemetry_stale else "degraded" + elif remote_control_fresh and not remote_video_fresh: + fault_reason = "video_session_recovering" + recovery_state = "recovering" + elif not remote_control_fresh and local_control_registered: + fault_reason = "control_session_recovering" + recovery_state = "recovering" + elif remote_control_fresh and not has_recent_frame: + fault_reason = "video_pipeline_stalled" + recovery_state = "degraded" + else: + fault_reason = "unknown" + recovery_state = "degraded" + + return { + "fault_reason": fault_reason, + "recovery_state": recovery_state, + "confidence": "derived", + "updated_at": utc_iso_now(), + } + + def _derive_robot_health_from_watchdog(self, watchdog_status: dict[str, Any]) -> dict[str, Any] | None: + updated_at_epoch_ms = _coerce_int(watchdog_status.get("updated_at_epoch_ms")) + if updated_at_epoch_ms <= 0: + return None + + now_epoch_ms = int(time.time() * 1000) + if now_epoch_ms - updated_at_epoch_ms > WATCHDOG_STATUS_STALE_MS: + return None + + raw_fault_reason = str(watchdog_status.get("fault_reason", "") or "") + raw_recovery_state = str(watchdog_status.get("recovery_state", "") or "") + normalized_fault_reason = "unknown" + normalized_recovery_state = raw_recovery_state or "degraded" + mapped_health = WATCHDOG_FAULT_REASON_MAP.get(raw_fault_reason) + + if mapped_health is not None: + normalized_fault_reason, recovery_override = mapped_health + if recovery_override is not None: + normalized_recovery_state = recovery_override + if raw_recovery_state == "backoff": + normalized_recovery_state = "backoff" + + return { + "fault_reason": normalized_fault_reason, + "recovery_state": normalized_recovery_state, + "confidence": "derived", + "updated_at": _utc_from_epoch(updated_at_epoch_ms / 1000.0) or utc_iso_now(), + } + def get_latest(self) -> dict[str, Any]: self._ensure_started() @@ -535,6 +641,7 @@ class NetworkTelemetryService: control_app = self._control_sender.session_stats() video_kcp = self._video_receiver.session_kcp_stats() control_kcp = self._control_sender.session_kcp_stats() + video_receiver_status = self._video_receiver.get_status() arbiter_status = self._control_arbiter.get_status() ingress_status = self._native_ingress.get_status() sender_status = self._control_sender.get_status() @@ -607,6 +714,16 @@ class NetworkTelemetryService: jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None local_control_registered = bool(control_app.get("registered", 0)) remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale")) + remote_video_fresh = bool(remote_sessions["video"].get("connected")) and not bool(remote_sessions["video"].get("stale")) + watchdog_status = _load_optional_json(WATCHDOG_STATUS_PATH) + robot_health = self._derive_robot_health( + video_receiver_status=video_receiver_status, + local_control_registered=local_control_registered, + remote_control_fresh=remote_control_fresh, + remote_video_fresh=remote_video_fresh, + telemetry_state=telemetry_state, + watchdog_status=watchdog_status, + ) if local_control_registered and remote_control_fresh: peer_status = "online" @@ -657,6 +774,7 @@ class NetworkTelemetryService: "last_server_error": str(telemetry_state.get("last_server_error", "") or ""), "reconnect_count": int(telemetry_state.get("reconnect_count", 0)), }, + "robot_health": robot_health, "ingress": { "native_udp": ingress_status, }, diff --git a/backend/monitoring/video.py b/backend/monitoring/video.py index 05b9b5d..c17cedc 100644 --- a/backend/monitoring/video.py +++ b/backend/monitoring/video.py @@ -40,6 +40,8 @@ class FrameTrailerMetadata: timestamp_ns: int latitude: float longitude: float + raw_latitude_hex: str + raw_longitude_hex: str received_at: float diff --git a/frontend/src/components/NetworkPanel.vue b/frontend/src/components/NetworkPanel.vue index cdbe187..29e15f9 100644 --- a/frontend/src/components/NetworkPanel.vue +++ b/frontend/src/components/NetworkPanel.vue @@ -84,6 +84,10 @@ function legSessions(link: LinkTelemetry | null): Array<{ name: string; data: Li
+

Robot Fault: {{ network?.robot_health?.fault_reason ?? 'n/a' }}

+

Recovery State: {{ network?.robot_health?.recovery_state ?? 'n/a' }}

+

Health Confidence: {{ network?.robot_health?.confidence ?? 'n/a' }}

+

Health Updated: {{ formatTime(network?.robot_health?.updated_at) }}

Transport: {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}

Telemetry Peer: {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}

Telemetry Registered: {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}

diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 0249435..55709c3 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -3,6 +3,8 @@ export interface GpsTelemetry { utc_time: string latitude: number | null longitude: number | null + raw_latitude_hex?: string + raw_longitude_hex?: string satellites: number | null altitude_m: number | null coordinate_system: string @@ -142,6 +144,13 @@ export interface TelemetryReceiverStatus { reconnect_count: number } +export interface RobotHealthStatus { + fault_reason: string + recovery_state: string + confidence: string + updated_at: string +} + export interface NetworkTelemetry { peer_status: string latency_ms: number | null @@ -170,6 +179,7 @@ export interface NetworkTelemetry { d_to_b: LinkTelemetry } telemetry_receiver: TelemetryReceiverStatus + robot_health: RobotHealthStatus ingress: { native_udp: NativeUdpIngress }