|
|
|
|
@@ -2,10 +2,12 @@ from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
from collections import deque
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import threading
|
|
|
|
|
import time
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from .common import (
|
|
|
|
|
@@ -21,6 +23,23 @@ from .video import FrameTrailerMetadata, OmniSocketVideoReceiver
|
|
|
|
|
LOCAL_SAMPLE_INTERVAL_MS = 500
|
|
|
|
|
TREND_HISTORY_SIZE = 10
|
|
|
|
|
TREND_WINDOW_SIZE = 5
|
|
|
|
|
BLITZ_RUNTIME_DIR = Path(os.getenv("BLITZ_RUNTIME_DIR", "/run/blitz-robot"))
|
|
|
|
|
WATCHDOG_STATUS_PATH = BLITZ_RUNTIME_DIR / "watchdog.status.json"
|
|
|
|
|
WATCHDOG_STATUS_STALE_MS = max(int(os.getenv("BLITZ_HEALTH_STALE_SEC", "15")), 1) * 1000
|
|
|
|
|
WATCHDOG_FAULT_REASON_MAP: dict[str, tuple[str, str | None]] = {
|
|
|
|
|
"": ("none", None),
|
|
|
|
|
"none": ("none", None),
|
|
|
|
|
"camera_missing": ("video_pipeline_stalled", "degraded"),
|
|
|
|
|
"camera_recovered": ("video_session_recovering", "recovering"),
|
|
|
|
|
"camera-reappeared-escalated": ("video_session_recovering", "recovering"),
|
|
|
|
|
"bside_status_stale": ("video_session_recovering", "recovering"),
|
|
|
|
|
"bside-unhealthy-escalated": ("video_session_recovering", "recovering"),
|
|
|
|
|
"ros_receiver_unhealthy": ("control_session_recovering", "recovering"),
|
|
|
|
|
"ros-unhealthy": ("control_session_recovering", "recovering"),
|
|
|
|
|
"network_or_robot_unreachable": ("network_or_robot_unreachable", "recovering"),
|
|
|
|
|
"network-recovered-ros-unhealthy": ("control_session_recovering", "recovering"),
|
|
|
|
|
"network-recovered-escalated": ("network_or_robot_unreachable", "recovering"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _utc_from_epoch(epoch_seconds: float | None) -> str | None:
|
|
|
|
|
@@ -47,6 +66,19 @@ def _coerce_float(value: Any, default: float = 0.0) -> float:
|
|
|
|
|
return default
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_optional_json(path: Path) -> dict[str, Any] | None:
|
|
|
|
|
try:
|
|
|
|
|
if not path.exists():
|
|
|
|
|
return None
|
|
|
|
|
with path.open("r", encoding="utf-8") as file:
|
|
|
|
|
payload = json.load(file)
|
|
|
|
|
if isinstance(payload, dict):
|
|
|
|
|
return payload
|
|
|
|
|
except Exception:
|
|
|
|
|
return None
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GpsDataService:
|
|
|
|
|
def __init__(self, receiver: OmniSocketVideoReceiver) -> None:
|
|
|
|
|
self._receiver = receiver
|
|
|
|
|
@@ -523,6 +555,80 @@ class NetworkTelemetryService:
|
|
|
|
|
return session
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
def _derive_robot_health(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
video_receiver_status: dict[str, Any],
|
|
|
|
|
local_control_registered: bool,
|
|
|
|
|
remote_control_fresh: bool,
|
|
|
|
|
remote_video_fresh: bool,
|
|
|
|
|
telemetry_state: dict[str, Any],
|
|
|
|
|
watchdog_status: dict[str, Any] | None,
|
|
|
|
|
) -> dict[str, Any]:
|
|
|
|
|
if watchdog_status is not None:
|
|
|
|
|
explicit_health = self._derive_robot_health_from_watchdog(watchdog_status)
|
|
|
|
|
if explicit_health is not None:
|
|
|
|
|
return explicit_health
|
|
|
|
|
|
|
|
|
|
has_recent_frame = bool(video_receiver_status.get("has_recent_frame"))
|
|
|
|
|
telemetry_connected = bool(telemetry_state.get("connected"))
|
|
|
|
|
telemetry_stale = bool(telemetry_state.get("stale", True))
|
|
|
|
|
|
|
|
|
|
if has_recent_frame and remote_control_fresh and remote_video_fresh:
|
|
|
|
|
fault_reason = "none"
|
|
|
|
|
recovery_state = "ok"
|
|
|
|
|
elif not remote_control_fresh and not remote_video_fresh and not has_recent_frame:
|
|
|
|
|
fault_reason = "network_or_robot_unreachable"
|
|
|
|
|
recovery_state = "recovering" if telemetry_connected and not telemetry_stale else "degraded"
|
|
|
|
|
elif remote_control_fresh and not remote_video_fresh:
|
|
|
|
|
fault_reason = "video_session_recovering"
|
|
|
|
|
recovery_state = "recovering"
|
|
|
|
|
elif not remote_control_fresh and local_control_registered:
|
|
|
|
|
fault_reason = "control_session_recovering"
|
|
|
|
|
recovery_state = "recovering"
|
|
|
|
|
elif remote_control_fresh and not has_recent_frame:
|
|
|
|
|
fault_reason = "video_pipeline_stalled"
|
|
|
|
|
recovery_state = "degraded"
|
|
|
|
|
else:
|
|
|
|
|
fault_reason = "unknown"
|
|
|
|
|
recovery_state = "degraded"
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
"fault_reason": fault_reason,
|
|
|
|
|
"recovery_state": recovery_state,
|
|
|
|
|
"confidence": "derived",
|
|
|
|
|
"updated_at": utc_iso_now(),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def _derive_robot_health_from_watchdog(self, watchdog_status: dict[str, Any]) -> dict[str, Any] | None:
|
|
|
|
|
updated_at_epoch_ms = _coerce_int(watchdog_status.get("updated_at_epoch_ms"))
|
|
|
|
|
if updated_at_epoch_ms <= 0:
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
now_epoch_ms = int(time.time() * 1000)
|
|
|
|
|
if now_epoch_ms - updated_at_epoch_ms > WATCHDOG_STATUS_STALE_MS:
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
raw_fault_reason = str(watchdog_status.get("fault_reason", "") or "")
|
|
|
|
|
raw_recovery_state = str(watchdog_status.get("recovery_state", "") or "")
|
|
|
|
|
normalized_fault_reason = "unknown"
|
|
|
|
|
normalized_recovery_state = raw_recovery_state or "degraded"
|
|
|
|
|
mapped_health = WATCHDOG_FAULT_REASON_MAP.get(raw_fault_reason)
|
|
|
|
|
|
|
|
|
|
if mapped_health is not None:
|
|
|
|
|
normalized_fault_reason, recovery_override = mapped_health
|
|
|
|
|
if recovery_override is not None:
|
|
|
|
|
normalized_recovery_state = recovery_override
|
|
|
|
|
if raw_recovery_state == "backoff":
|
|
|
|
|
normalized_recovery_state = "backoff"
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
"fault_reason": normalized_fault_reason,
|
|
|
|
|
"recovery_state": normalized_recovery_state,
|
|
|
|
|
"confidence": "derived",
|
|
|
|
|
"updated_at": _utc_from_epoch(updated_at_epoch_ms / 1000.0) or utc_iso_now(),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def get_latest(self) -> dict[str, Any]:
|
|
|
|
|
self._ensure_started()
|
|
|
|
|
|
|
|
|
|
@@ -535,6 +641,7 @@ class NetworkTelemetryService:
|
|
|
|
|
control_app = self._control_sender.session_stats()
|
|
|
|
|
video_kcp = self._video_receiver.session_kcp_stats()
|
|
|
|
|
control_kcp = self._control_sender.session_kcp_stats()
|
|
|
|
|
video_receiver_status = self._video_receiver.get_status()
|
|
|
|
|
arbiter_status = self._control_arbiter.get_status()
|
|
|
|
|
ingress_status = self._native_ingress.get_status()
|
|
|
|
|
sender_status = self._control_sender.get_status()
|
|
|
|
|
@@ -607,6 +714,16 @@ class NetworkTelemetryService:
|
|
|
|
|
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
|
|
|
|
|
local_control_registered = bool(control_app.get("registered", 0))
|
|
|
|
|
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
|
|
|
|
|
remote_video_fresh = bool(remote_sessions["video"].get("connected")) and not bool(remote_sessions["video"].get("stale"))
|
|
|
|
|
watchdog_status = _load_optional_json(WATCHDOG_STATUS_PATH)
|
|
|
|
|
robot_health = self._derive_robot_health(
|
|
|
|
|
video_receiver_status=video_receiver_status,
|
|
|
|
|
local_control_registered=local_control_registered,
|
|
|
|
|
remote_control_fresh=remote_control_fresh,
|
|
|
|
|
remote_video_fresh=remote_video_fresh,
|
|
|
|
|
telemetry_state=telemetry_state,
|
|
|
|
|
watchdog_status=watchdog_status,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if local_control_registered and remote_control_fresh:
|
|
|
|
|
peer_status = "online"
|
|
|
|
|
@@ -657,6 +774,7 @@ class NetworkTelemetryService:
|
|
|
|
|
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
|
|
|
|
|
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
|
|
|
|
|
},
|
|
|
|
|
"robot_health": robot_health,
|
|
|
|
|
"ingress": {
|
|
|
|
|
"native_udp": ingress_status,
|
|
|
|
|
},
|
|
|
|
|
|