feat: 显示机器人故障原因
This commit is contained in:
@@ -2,10 +2,12 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from collections import deque
|
from collections import deque
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .common import (
|
from .common import (
|
||||||
@@ -21,6 +23,23 @@ from .video import FrameTrailerMetadata, OmniSocketVideoReceiver
|
|||||||
LOCAL_SAMPLE_INTERVAL_MS = 500
|
LOCAL_SAMPLE_INTERVAL_MS = 500
|
||||||
TREND_HISTORY_SIZE = 10
|
TREND_HISTORY_SIZE = 10
|
||||||
TREND_WINDOW_SIZE = 5
|
TREND_WINDOW_SIZE = 5
|
||||||
|
BLITZ_RUNTIME_DIR = Path(os.getenv("BLITZ_RUNTIME_DIR", "/run/blitz-robot"))
|
||||||
|
WATCHDOG_STATUS_PATH = BLITZ_RUNTIME_DIR / "watchdog.status.json"
|
||||||
|
WATCHDOG_STATUS_STALE_MS = max(int(os.getenv("BLITZ_HEALTH_STALE_SEC", "15")), 1) * 1000
|
||||||
|
WATCHDOG_FAULT_REASON_MAP: dict[str, tuple[str, str | None]] = {
|
||||||
|
"": ("none", None),
|
||||||
|
"none": ("none", None),
|
||||||
|
"camera_missing": ("video_pipeline_stalled", "degraded"),
|
||||||
|
"camera_recovered": ("video_session_recovering", "recovering"),
|
||||||
|
"camera-reappeared-escalated": ("video_session_recovering", "recovering"),
|
||||||
|
"bside_status_stale": ("video_session_recovering", "recovering"),
|
||||||
|
"bside-unhealthy-escalated": ("video_session_recovering", "recovering"),
|
||||||
|
"ros_receiver_unhealthy": ("control_session_recovering", "recovering"),
|
||||||
|
"ros-unhealthy": ("control_session_recovering", "recovering"),
|
||||||
|
"network_or_robot_unreachable": ("network_or_robot_unreachable", "recovering"),
|
||||||
|
"network-recovered-ros-unhealthy": ("control_session_recovering", "recovering"),
|
||||||
|
"network-recovered-escalated": ("network_or_robot_unreachable", "recovering"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _utc_from_epoch(epoch_seconds: float | None) -> str | None:
|
def _utc_from_epoch(epoch_seconds: float | None) -> str | None:
|
||||||
@@ -47,6 +66,19 @@ def _coerce_float(value: Any, default: float = 0.0) -> float:
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _load_optional_json(path: Path) -> dict[str, Any] | None:
|
||||||
|
try:
|
||||||
|
if not path.exists():
|
||||||
|
return None
|
||||||
|
with path.open("r", encoding="utf-8") as file:
|
||||||
|
payload = json.load(file)
|
||||||
|
if isinstance(payload, dict):
|
||||||
|
return payload
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class GpsDataService:
|
class GpsDataService:
|
||||||
def __init__(self, receiver: OmniSocketVideoReceiver) -> None:
|
def __init__(self, receiver: OmniSocketVideoReceiver) -> None:
|
||||||
self._receiver = receiver
|
self._receiver = receiver
|
||||||
@@ -523,6 +555,80 @@ class NetworkTelemetryService:
|
|||||||
return session
|
return session
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _derive_robot_health(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
video_receiver_status: dict[str, Any],
|
||||||
|
local_control_registered: bool,
|
||||||
|
remote_control_fresh: bool,
|
||||||
|
remote_video_fresh: bool,
|
||||||
|
telemetry_state: dict[str, Any],
|
||||||
|
watchdog_status: dict[str, Any] | None,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
if watchdog_status is not None:
|
||||||
|
explicit_health = self._derive_robot_health_from_watchdog(watchdog_status)
|
||||||
|
if explicit_health is not None:
|
||||||
|
return explicit_health
|
||||||
|
|
||||||
|
has_recent_frame = bool(video_receiver_status.get("has_recent_frame"))
|
||||||
|
telemetry_connected = bool(telemetry_state.get("connected"))
|
||||||
|
telemetry_stale = bool(telemetry_state.get("stale", True))
|
||||||
|
|
||||||
|
if has_recent_frame and remote_control_fresh and remote_video_fresh:
|
||||||
|
fault_reason = "none"
|
||||||
|
recovery_state = "ok"
|
||||||
|
elif not remote_control_fresh and not remote_video_fresh and not has_recent_frame:
|
||||||
|
fault_reason = "network_or_robot_unreachable"
|
||||||
|
recovery_state = "recovering" if telemetry_connected and not telemetry_stale else "degraded"
|
||||||
|
elif remote_control_fresh and not remote_video_fresh:
|
||||||
|
fault_reason = "video_session_recovering"
|
||||||
|
recovery_state = "recovering"
|
||||||
|
elif not remote_control_fresh and local_control_registered:
|
||||||
|
fault_reason = "control_session_recovering"
|
||||||
|
recovery_state = "recovering"
|
||||||
|
elif remote_control_fresh and not has_recent_frame:
|
||||||
|
fault_reason = "video_pipeline_stalled"
|
||||||
|
recovery_state = "degraded"
|
||||||
|
else:
|
||||||
|
fault_reason = "unknown"
|
||||||
|
recovery_state = "degraded"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"fault_reason": fault_reason,
|
||||||
|
"recovery_state": recovery_state,
|
||||||
|
"confidence": "derived",
|
||||||
|
"updated_at": utc_iso_now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _derive_robot_health_from_watchdog(self, watchdog_status: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
|
updated_at_epoch_ms = _coerce_int(watchdog_status.get("updated_at_epoch_ms"))
|
||||||
|
if updated_at_epoch_ms <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
now_epoch_ms = int(time.time() * 1000)
|
||||||
|
if now_epoch_ms - updated_at_epoch_ms > WATCHDOG_STATUS_STALE_MS:
|
||||||
|
return None
|
||||||
|
|
||||||
|
raw_fault_reason = str(watchdog_status.get("fault_reason", "") or "")
|
||||||
|
raw_recovery_state = str(watchdog_status.get("recovery_state", "") or "")
|
||||||
|
normalized_fault_reason = "unknown"
|
||||||
|
normalized_recovery_state = raw_recovery_state or "degraded"
|
||||||
|
mapped_health = WATCHDOG_FAULT_REASON_MAP.get(raw_fault_reason)
|
||||||
|
|
||||||
|
if mapped_health is not None:
|
||||||
|
normalized_fault_reason, recovery_override = mapped_health
|
||||||
|
if recovery_override is not None:
|
||||||
|
normalized_recovery_state = recovery_override
|
||||||
|
if raw_recovery_state == "backoff":
|
||||||
|
normalized_recovery_state = "backoff"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"fault_reason": normalized_fault_reason,
|
||||||
|
"recovery_state": normalized_recovery_state,
|
||||||
|
"confidence": "derived",
|
||||||
|
"updated_at": _utc_from_epoch(updated_at_epoch_ms / 1000.0) or utc_iso_now(),
|
||||||
|
}
|
||||||
|
|
||||||
def get_latest(self) -> dict[str, Any]:
|
def get_latest(self) -> dict[str, Any]:
|
||||||
self._ensure_started()
|
self._ensure_started()
|
||||||
|
|
||||||
@@ -535,6 +641,7 @@ class NetworkTelemetryService:
|
|||||||
control_app = self._control_sender.session_stats()
|
control_app = self._control_sender.session_stats()
|
||||||
video_kcp = self._video_receiver.session_kcp_stats()
|
video_kcp = self._video_receiver.session_kcp_stats()
|
||||||
control_kcp = self._control_sender.session_kcp_stats()
|
control_kcp = self._control_sender.session_kcp_stats()
|
||||||
|
video_receiver_status = self._video_receiver.get_status()
|
||||||
arbiter_status = self._control_arbiter.get_status()
|
arbiter_status = self._control_arbiter.get_status()
|
||||||
ingress_status = self._native_ingress.get_status()
|
ingress_status = self._native_ingress.get_status()
|
||||||
sender_status = self._control_sender.get_status()
|
sender_status = self._control_sender.get_status()
|
||||||
@@ -607,6 +714,16 @@ class NetworkTelemetryService:
|
|||||||
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
|
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
|
||||||
local_control_registered = bool(control_app.get("registered", 0))
|
local_control_registered = bool(control_app.get("registered", 0))
|
||||||
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
|
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
|
||||||
|
remote_video_fresh = bool(remote_sessions["video"].get("connected")) and not bool(remote_sessions["video"].get("stale"))
|
||||||
|
watchdog_status = _load_optional_json(WATCHDOG_STATUS_PATH)
|
||||||
|
robot_health = self._derive_robot_health(
|
||||||
|
video_receiver_status=video_receiver_status,
|
||||||
|
local_control_registered=local_control_registered,
|
||||||
|
remote_control_fresh=remote_control_fresh,
|
||||||
|
remote_video_fresh=remote_video_fresh,
|
||||||
|
telemetry_state=telemetry_state,
|
||||||
|
watchdog_status=watchdog_status,
|
||||||
|
)
|
||||||
|
|
||||||
if local_control_registered and remote_control_fresh:
|
if local_control_registered and remote_control_fresh:
|
||||||
peer_status = "online"
|
peer_status = "online"
|
||||||
@@ -657,6 +774,7 @@ class NetworkTelemetryService:
|
|||||||
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
|
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
|
||||||
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
|
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
|
||||||
},
|
},
|
||||||
|
"robot_health": robot_health,
|
||||||
"ingress": {
|
"ingress": {
|
||||||
"native_udp": ingress_status,
|
"native_udp": ingress_status,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ class FrameTrailerMetadata:
|
|||||||
timestamp_ns: int
|
timestamp_ns: int
|
||||||
latitude: float
|
latitude: float
|
||||||
longitude: float
|
longitude: float
|
||||||
|
raw_latitude_hex: str
|
||||||
|
raw_longitude_hex: str
|
||||||
received_at: float
|
received_at: float
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -84,6 +84,10 @@ function legSessions(link: LinkTelemetry | null): Array<{ name: string; data: Li
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="summary telemetry-strip">
|
<div class="summary telemetry-strip">
|
||||||
|
<p><strong>Robot Fault:</strong> {{ network?.robot_health?.fault_reason ?? 'n/a' }}</p>
|
||||||
|
<p><strong>Recovery State:</strong> {{ network?.robot_health?.recovery_state ?? 'n/a' }}</p>
|
||||||
|
<p><strong>Health Confidence:</strong> {{ network?.robot_health?.confidence ?? 'n/a' }}</p>
|
||||||
|
<p><strong>Health Updated:</strong> {{ formatTime(network?.robot_health?.updated_at) }}</p>
|
||||||
<p><strong>Transport:</strong> {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}</p>
|
<p><strong>Transport:</strong> {{ network?.transport ?? 'n/a' }} / {{ network?.source_mode ?? 'n/a' }}</p>
|
||||||
<p><strong>Telemetry Peer:</strong> {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}</p>
|
<p><strong>Telemetry Peer:</strong> {{ network?.telemetry_receiver?.peer_id ?? 'n/a' }}</p>
|
||||||
<p><strong>Telemetry Registered:</strong> {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}</p>
|
<p><strong>Telemetry Registered:</strong> {{ network?.telemetry_receiver?.registered ? 'yes' : 'no' }}</p>
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ export interface GpsTelemetry {
|
|||||||
utc_time: string
|
utc_time: string
|
||||||
latitude: number | null
|
latitude: number | null
|
||||||
longitude: number | null
|
longitude: number | null
|
||||||
|
raw_latitude_hex?: string
|
||||||
|
raw_longitude_hex?: string
|
||||||
satellites: number | null
|
satellites: number | null
|
||||||
altitude_m: number | null
|
altitude_m: number | null
|
||||||
coordinate_system: string
|
coordinate_system: string
|
||||||
@@ -142,6 +144,13 @@ export interface TelemetryReceiverStatus {
|
|||||||
reconnect_count: number
|
reconnect_count: number
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface RobotHealthStatus {
|
||||||
|
fault_reason: string
|
||||||
|
recovery_state: string
|
||||||
|
confidence: string
|
||||||
|
updated_at: string
|
||||||
|
}
|
||||||
|
|
||||||
export interface NetworkTelemetry {
|
export interface NetworkTelemetry {
|
||||||
peer_status: string
|
peer_status: string
|
||||||
latency_ms: number | null
|
latency_ms: number | null
|
||||||
@@ -170,6 +179,7 @@ export interface NetworkTelemetry {
|
|||||||
d_to_b: LinkTelemetry
|
d_to_b: LinkTelemetry
|
||||||
}
|
}
|
||||||
telemetry_receiver: TelemetryReceiverStatus
|
telemetry_receiver: TelemetryReceiverStatus
|
||||||
|
robot_health: RobotHealthStatus
|
||||||
ingress: {
|
ingress: {
|
||||||
native_udp: NativeUdpIngress
|
native_udp: NativeUdpIngress
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user