feat: 显示机器人故障原因

This commit is contained in:
2026-04-13 21:55:17 +08:00
parent 906428fc3b
commit 7cd464bc6a
4 changed files with 134 additions and 0 deletions

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
from collections import deque from collections import deque
import json import json
import os
import sys import sys
import threading import threading
import time import time
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path
from typing import Any from typing import Any
from .common import ( from .common import (
@@ -21,6 +23,23 @@ from .video import FrameTrailerMetadata, OmniSocketVideoReceiver
LOCAL_SAMPLE_INTERVAL_MS = 500 LOCAL_SAMPLE_INTERVAL_MS = 500
TREND_HISTORY_SIZE = 10 TREND_HISTORY_SIZE = 10
TREND_WINDOW_SIZE = 5 TREND_WINDOW_SIZE = 5
BLITZ_RUNTIME_DIR = Path(os.getenv("BLITZ_RUNTIME_DIR", "/run/blitz-robot"))
WATCHDOG_STATUS_PATH = BLITZ_RUNTIME_DIR / "watchdog.status.json"
WATCHDOG_STATUS_STALE_MS = max(int(os.getenv("BLITZ_HEALTH_STALE_SEC", "15")), 1) * 1000
WATCHDOG_FAULT_REASON_MAP: dict[str, tuple[str, str | None]] = {
"": ("none", None),
"none": ("none", None),
"camera_missing": ("video_pipeline_stalled", "degraded"),
"camera_recovered": ("video_session_recovering", "recovering"),
"camera-reappeared-escalated": ("video_session_recovering", "recovering"),
"bside_status_stale": ("video_session_recovering", "recovering"),
"bside-unhealthy-escalated": ("video_session_recovering", "recovering"),
"ros_receiver_unhealthy": ("control_session_recovering", "recovering"),
"ros-unhealthy": ("control_session_recovering", "recovering"),
"network_or_robot_unreachable": ("network_or_robot_unreachable", "recovering"),
"network-recovered-ros-unhealthy": ("control_session_recovering", "recovering"),
"network-recovered-escalated": ("network_or_robot_unreachable", "recovering"),
}
def _utc_from_epoch(epoch_seconds: float | None) -> str | None: def _utc_from_epoch(epoch_seconds: float | None) -> str | None:
@@ -47,6 +66,19 @@ def _coerce_float(value: Any, default: float = 0.0) -> float:
return default return default
def _load_optional_json(path: Path) -> dict[str, Any] | None:
try:
if not path.exists():
return None
with path.open("r", encoding="utf-8") as file:
payload = json.load(file)
if isinstance(payload, dict):
return payload
except Exception:
return None
return None
class GpsDataService: class GpsDataService:
def __init__(self, receiver: OmniSocketVideoReceiver) -> None: def __init__(self, receiver: OmniSocketVideoReceiver) -> None:
self._receiver = receiver self._receiver = receiver
@@ -523,6 +555,80 @@ class NetworkTelemetryService:
return session return session
return None return None
def _derive_robot_health(
self,
*,
video_receiver_status: dict[str, Any],
local_control_registered: bool,
remote_control_fresh: bool,
remote_video_fresh: bool,
telemetry_state: dict[str, Any],
watchdog_status: dict[str, Any] | None,
) -> dict[str, Any]:
if watchdog_status is not None:
explicit_health = self._derive_robot_health_from_watchdog(watchdog_status)
if explicit_health is not None:
return explicit_health
has_recent_frame = bool(video_receiver_status.get("has_recent_frame"))
telemetry_connected = bool(telemetry_state.get("connected"))
telemetry_stale = bool(telemetry_state.get("stale", True))
if has_recent_frame and remote_control_fresh and remote_video_fresh:
fault_reason = "none"
recovery_state = "ok"
elif not remote_control_fresh and not remote_video_fresh and not has_recent_frame:
fault_reason = "network_or_robot_unreachable"
recovery_state = "recovering" if telemetry_connected and not telemetry_stale else "degraded"
elif remote_control_fresh and not remote_video_fresh:
fault_reason = "video_session_recovering"
recovery_state = "recovering"
elif not remote_control_fresh and local_control_registered:
fault_reason = "control_session_recovering"
recovery_state = "recovering"
elif remote_control_fresh and not has_recent_frame:
fault_reason = "video_pipeline_stalled"
recovery_state = "degraded"
else:
fault_reason = "unknown"
recovery_state = "degraded"
return {
"fault_reason": fault_reason,
"recovery_state": recovery_state,
"confidence": "derived",
"updated_at": utc_iso_now(),
}
def _derive_robot_health_from_watchdog(self, watchdog_status: dict[str, Any]) -> dict[str, Any] | None:
updated_at_epoch_ms = _coerce_int(watchdog_status.get("updated_at_epoch_ms"))
if updated_at_epoch_ms <= 0:
return None
now_epoch_ms = int(time.time() * 1000)
if now_epoch_ms - updated_at_epoch_ms > WATCHDOG_STATUS_STALE_MS:
return None
raw_fault_reason = str(watchdog_status.get("fault_reason", "") or "")
raw_recovery_state = str(watchdog_status.get("recovery_state", "") or "")
normalized_fault_reason = "unknown"
normalized_recovery_state = raw_recovery_state or "degraded"
mapped_health = WATCHDOG_FAULT_REASON_MAP.get(raw_fault_reason)
if mapped_health is not None:
normalized_fault_reason, recovery_override = mapped_health
if recovery_override is not None:
normalized_recovery_state = recovery_override
if raw_recovery_state == "backoff":
normalized_recovery_state = "backoff"
return {
"fault_reason": normalized_fault_reason,
"recovery_state": normalized_recovery_state,
"confidence": "derived",
"updated_at": _utc_from_epoch(updated_at_epoch_ms / 1000.0) or utc_iso_now(),
}
def get_latest(self) -> dict[str, Any]: def get_latest(self) -> dict[str, Any]:
self._ensure_started() self._ensure_started()
@@ -535,6 +641,7 @@ class NetworkTelemetryService:
control_app = self._control_sender.session_stats() control_app = self._control_sender.session_stats()
video_kcp = self._video_receiver.session_kcp_stats() video_kcp = self._video_receiver.session_kcp_stats()
control_kcp = self._control_sender.session_kcp_stats() control_kcp = self._control_sender.session_kcp_stats()
video_receiver_status = self._video_receiver.get_status()
arbiter_status = self._control_arbiter.get_status() arbiter_status = self._control_arbiter.get_status()
ingress_status = self._native_ingress.get_status() ingress_status = self._native_ingress.get_status()
sender_status = self._control_sender.get_status() sender_status = self._control_sender.get_status()
@@ -607,6 +714,16 @@ class NetworkTelemetryService:
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
local_control_registered = bool(control_app.get("registered", 0)) local_control_registered = bool(control_app.get("registered", 0))
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale")) remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
remote_video_fresh = bool(remote_sessions["video"].get("connected")) and not bool(remote_sessions["video"].get("stale"))
watchdog_status = _load_optional_json(WATCHDOG_STATUS_PATH)
robot_health = self._derive_robot_health(
video_receiver_status=video_receiver_status,
local_control_registered=local_control_registered,
remote_control_fresh=remote_control_fresh,
remote_video_fresh=remote_video_fresh,
telemetry_state=telemetry_state,
watchdog_status=watchdog_status,
)
if local_control_registered and remote_control_fresh: if local_control_registered and remote_control_fresh:
peer_status = "online" peer_status = "online"
@@ -657,6 +774,7 @@ class NetworkTelemetryService:
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""), "last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)), "reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
}, },
"robot_health": robot_health,
"ingress": { "ingress": {
"native_udp": ingress_status, "native_udp": ingress_status,
}, },

View File

@@ -40,6 +40,8 @@ class FrameTrailerMetadata:
timestamp_ns: int timestamp_ns: int
latitude: float latitude: float
longitude: float longitude: float
raw_latitude_hex: str
raw_longitude_hex: str
received_at: float received_at: float

View File

@@ -84,6 +84,10 @@ function legSessions(link: LinkTelemetry | null): Array<{ name: string; data: Li
</div> </div>
<div class="summary telemetry-strip"> <div class="summary telemetry-strip">
<p><strong>Robot Fault:</strong> {{ network?.robot_health?.fault_reason ?? 'n/a' }}</p>
<p><strong>Recovery State:</strong> {{ network?.robot_health?.recovery_state ?? 'n/a' }}</p>
<p><strong>Health Confidence:</strong> {{ network?.robot_health?.confidence ?? 'n/a' }}</p>
<p><strong>Health Updated:</strong> {{ formatTime(network?.robot_health?.updated_at) }}</p>
<p><strong>Transport:</strong> {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}</p> <p><strong>Transport:</strong> {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}</p>
<p><strong>Telemetry Peer:</strong> {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}</p> <p><strong>Telemetry Peer:</strong> {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}</p>
<p><strong>Telemetry Registered:</strong> {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}</p> <p><strong>Telemetry Registered:</strong> {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}</p>

View File

@@ -3,6 +3,8 @@ export interface GpsTelemetry {
utc_time: string utc_time: string
latitude: number | null latitude: number | null
longitude: number | null longitude: number | null
raw_latitude_hex?: string
raw_longitude_hex?: string
satellites: number | null satellites: number | null
altitude_m: number | null altitude_m: number | null
coordinate_system: string coordinate_system: string
@@ -142,6 +144,13 @@ export interface TelemetryReceiverStatus {
reconnect_count: number reconnect_count: number
} }
export interface RobotHealthStatus {
fault_reason: string
recovery_state: string
confidence: string
updated_at: string
}
export interface NetworkTelemetry { export interface NetworkTelemetry {
peer_status: string peer_status: string
latency_ms: number | null latency_ms: number | null
@@ -170,6 +179,7 @@ export interface NetworkTelemetry {
d_to_b: LinkTelemetry d_to_b: LinkTelemetry
} }
telemetry_receiver: TelemetryReceiverStatus telemetry_receiver: TelemetryReceiverStatus
robot_health: RobotHealthStatus
ingress: { ingress: {
native_udp: NativeUdpIngress native_udp: NativeUdpIngress
} }