feat: 显示机器人故障原因

This commit is contained in:
2026-04-13 21:55:17 +08:00
parent 906428fc3b
commit 7cd464bc6a
4 changed files with 134 additions and 0 deletions

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
from collections import deque
import json
import os
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from .common import (
@@ -21,6 +23,23 @@ from .video import FrameTrailerMetadata, OmniSocketVideoReceiver
LOCAL_SAMPLE_INTERVAL_MS = 500
TREND_HISTORY_SIZE = 10
TREND_WINDOW_SIZE = 5
BLITZ_RUNTIME_DIR = Path(os.getenv("BLITZ_RUNTIME_DIR", "/run/blitz-robot"))
WATCHDOG_STATUS_PATH = BLITZ_RUNTIME_DIR / "watchdog.status.json"
WATCHDOG_STATUS_STALE_MS = max(int(os.getenv("BLITZ_HEALTH_STALE_SEC", "15")), 1) * 1000
WATCHDOG_FAULT_REASON_MAP: dict[str, tuple[str, str | None]] = {
"": ("none", None),
"none": ("none", None),
"camera_missing": ("video_pipeline_stalled", "degraded"),
"camera_recovered": ("video_session_recovering", "recovering"),
"camera-reappeared-escalated": ("video_session_recovering", "recovering"),
"bside_status_stale": ("video_session_recovering", "recovering"),
"bside-unhealthy-escalated": ("video_session_recovering", "recovering"),
"ros_receiver_unhealthy": ("control_session_recovering", "recovering"),
"ros-unhealthy": ("control_session_recovering", "recovering"),
"network_or_robot_unreachable": ("network_or_robot_unreachable", "recovering"),
"network-recovered-ros-unhealthy": ("control_session_recovering", "recovering"),
"network-recovered-escalated": ("network_or_robot_unreachable", "recovering"),
}
def _utc_from_epoch(epoch_seconds: float | None) -> str | None:
@@ -47,6 +66,19 @@ def _coerce_float(value: Any, default: float = 0.0) -> float:
return default
def _load_optional_json(path: Path) -> dict[str, Any] | None:
try:
if not path.exists():
return None
with path.open("r", encoding="utf-8") as file:
payload = json.load(file)
if isinstance(payload, dict):
return payload
except Exception:
return None
return None
class GpsDataService:
def __init__(self, receiver: OmniSocketVideoReceiver) -> None:
self._receiver = receiver
@@ -523,6 +555,80 @@ class NetworkTelemetryService:
return session
return None
def _derive_robot_health(
self,
*,
video_receiver_status: dict[str, Any],
local_control_registered: bool,
remote_control_fresh: bool,
remote_video_fresh: bool,
telemetry_state: dict[str, Any],
watchdog_status: dict[str, Any] | None,
) -> dict[str, Any]:
if watchdog_status is not None:
explicit_health = self._derive_robot_health_from_watchdog(watchdog_status)
if explicit_health is not None:
return explicit_health
has_recent_frame = bool(video_receiver_status.get("has_recent_frame"))
telemetry_connected = bool(telemetry_state.get("connected"))
telemetry_stale = bool(telemetry_state.get("stale", True))
if has_recent_frame and remote_control_fresh and remote_video_fresh:
fault_reason = "none"
recovery_state = "ok"
elif not remote_control_fresh and not remote_video_fresh and not has_recent_frame:
fault_reason = "network_or_robot_unreachable"
recovery_state = "recovering" if telemetry_connected and not telemetry_stale else "degraded"
elif remote_control_fresh and not remote_video_fresh:
fault_reason = "video_session_recovering"
recovery_state = "recovering"
elif not remote_control_fresh and local_control_registered:
fault_reason = "control_session_recovering"
recovery_state = "recovering"
elif remote_control_fresh and not has_recent_frame:
fault_reason = "video_pipeline_stalled"
recovery_state = "degraded"
else:
fault_reason = "unknown"
recovery_state = "degraded"
return {
"fault_reason": fault_reason,
"recovery_state": recovery_state,
"confidence": "derived",
"updated_at": utc_iso_now(),
}
def _derive_robot_health_from_watchdog(self, watchdog_status: dict[str, Any]) -> dict[str, Any] | None:
updated_at_epoch_ms = _coerce_int(watchdog_status.get("updated_at_epoch_ms"))
if updated_at_epoch_ms <= 0:
return None
now_epoch_ms = int(time.time() * 1000)
if now_epoch_ms - updated_at_epoch_ms > WATCHDOG_STATUS_STALE_MS:
return None
raw_fault_reason = str(watchdog_status.get("fault_reason", "") or "")
raw_recovery_state = str(watchdog_status.get("recovery_state", "") or "")
normalized_fault_reason = "unknown"
normalized_recovery_state = raw_recovery_state or "degraded"
mapped_health = WATCHDOG_FAULT_REASON_MAP.get(raw_fault_reason)
if mapped_health is not None:
normalized_fault_reason, recovery_override = mapped_health
if recovery_override is not None:
normalized_recovery_state = recovery_override
if raw_recovery_state == "backoff":
normalized_recovery_state = "backoff"
return {
"fault_reason": normalized_fault_reason,
"recovery_state": normalized_recovery_state,
"confidence": "derived",
"updated_at": _utc_from_epoch(updated_at_epoch_ms / 1000.0) or utc_iso_now(),
}
def get_latest(self) -> dict[str, Any]:
self._ensure_started()
@@ -535,6 +641,7 @@ class NetworkTelemetryService:
control_app = self._control_sender.session_stats()
video_kcp = self._video_receiver.session_kcp_stats()
control_kcp = self._control_sender.session_kcp_stats()
video_receiver_status = self._video_receiver.get_status()
arbiter_status = self._control_arbiter.get_status()
ingress_status = self._native_ingress.get_status()
sender_status = self._control_sender.get_status()
@@ -607,6 +714,16 @@ class NetworkTelemetryService:
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
local_control_registered = bool(control_app.get("registered", 0))
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
remote_video_fresh = bool(remote_sessions["video"].get("connected")) and not bool(remote_sessions["video"].get("stale"))
watchdog_status = _load_optional_json(WATCHDOG_STATUS_PATH)
robot_health = self._derive_robot_health(
video_receiver_status=video_receiver_status,
local_control_registered=local_control_registered,
remote_control_fresh=remote_control_fresh,
remote_video_fresh=remote_video_fresh,
telemetry_state=telemetry_state,
watchdog_status=watchdog_status,
)
if local_control_registered and remote_control_fresh:
peer_status = "online"
@@ -657,6 +774,7 @@ class NetworkTelemetryService:
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
},
"robot_health": robot_health,
"ingress": {
"native_udp": ingress_status,
},

View File

@@ -40,6 +40,8 @@ class FrameTrailerMetadata:
timestamp_ns: int
latitude: float
longitude: float
raw_latitude_hex: str
raw_longitude_hex: str
received_at: float

View File

@@ -84,6 +84,10 @@ function legSessions(link: LinkTelemetry | null): Array<{ name: string; data: Li
</div>
<div class="summary telemetry-strip">
<p><strong>Robot Fault:</strong> {{ network?.robot_health?.fault_reason ?? 'n/a' }}</p>
<p><strong>Recovery State:</strong> {{ network?.robot_health?.recovery_state ?? 'n/a' }}</p>
<p><strong>Health Confidence:</strong> {{ network?.robot_health?.confidence ?? 'n/a' }}</p>
<p><strong>Health Updated:</strong> {{ formatTime(network?.robot_health?.updated_at) }}</p>
<p><strong>Transport:</strong> {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}</p>
<p><strong>Telemetry Peer:</strong> {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}</p>
<p><strong>Telemetry Registered:</strong> {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}</p>

View File

@@ -3,6 +3,8 @@ export interface GpsTelemetry {
utc_time: string
latitude: number | null
longitude: number | null
raw_latitude_hex?: string
raw_longitude_hex?: string
satellites: number | null
altitude_m: number | null
coordinate_system: string
@@ -142,6 +144,13 @@ export interface TelemetryReceiverStatus {
reconnect_count: number
}
export interface RobotHealthStatus {
fault_reason: string
recovery_state: string
confidence: string
updated_at: string
}
export interface NetworkTelemetry {
peer_status: string
latency_ms: number | null
@@ -170,6 +179,7 @@ export interface NetworkTelemetry {
d_to_b: LinkTelemetry
}
telemetry_receiver: TelemetryReceiverStatus
robot_health: RobotHealthStatus
ingress: {
native_udp: NativeUdpIngress
}