feat: 长保持连接,控制端可重启

This commit is contained in:
Mock
2026-04-10 11:10:54 +08:00
parent 08057baf0c
commit adb43efb12
6 changed files with 190 additions and 23 deletions

View File

@@ -214,6 +214,9 @@ class HubTelemetryReceiver:
self._last_error = ""
self._last_received_wall = 0.0
self._last_received_monotonic = 0.0
self._reconnect_count = 0
self._ever_connected = False
self._closing = threading.Event()
self._load_backend()
def _load_backend(self) -> None:
@@ -259,7 +262,7 @@ class HubTelemetryReceiver:
return
with self._lock:
if self._started:
if self._started or self._closing.is_set():
return
self._started = True
self._thread = threading.Thread(
@@ -270,14 +273,18 @@ class HubTelemetryReceiver:
self._thread.start()
def _run(self) -> None:
while True:
while not self._closing.is_set():
try:
session = self._connect_session()
with self._lock:
self._session = session
self._last_error = ""
if self._ever_connected:
self._reconnect_count += 1
else:
self._ever_connected = True
while True:
while not self._closing.is_set():
result = session.recv(timeout_ms=1000)
if result is None:
continue
@@ -302,18 +309,28 @@ class HubTelemetryReceiver:
self._last_received_monotonic = now_mono
self._last_error = ""
except Exception as error: # pragma: no cover - runtime integration path
with self._lock:
self._last_error = str(error)
if not self._closing.is_set():
session_error = ""
if self._session is not None:
try:
session_error = str(dict(self._session.stats()).get("last_server_error", "") or "")
except Exception:
session_error = ""
with self._lock:
self._last_error = session_error or str(error)
finally:
with self._lock:
session = self._session
self._session = None
if self._closing.is_set():
self._started = False
if session is not None:
try:
session.close()
except Exception:
pass
time.sleep(2)
if not self._closing.is_set():
time.sleep(2)
def get_snapshot(self) -> dict[str, Any]:
self.ensure_started()
@@ -326,6 +343,14 @@ class HubTelemetryReceiver:
snapshot = self._latest_snapshot
connected = self._session is not None
last_error = self._last_error
reconnect_count = self._reconnect_count
if self._session is not None:
try:
session_stats = dict(self._session.stats())
except Exception:
session_stats = {}
else:
session_stats = {}
stale = True
if received_monotonic > 0.0:
@@ -339,8 +364,24 @@ class HubTelemetryReceiver:
"peer_id": str(cfg.get("peer_id", "peer-a-telemetry")),
"snapshot": snapshot or {"sessions": []},
"last_error": last_error,
"registered": bool(session_stats.get("registered", 0)),
"last_server_error": str(session_stats.get("last_server_error", "") or ""),
"reconnect_count": reconnect_count,
}
def close(self) -> None:
    """Shut the receiver down and release its current session.

    Sets the ``_closing`` event (which the worker loop and ``ensure_started``
    check), closes the active session if there is one, and then waits up to
    0.5 seconds for the worker thread to exit. The wait is bounded, so the
    worker may still be alive when this method returns.

    Calling ``close()`` more than once is harmless: the session reference is
    cleared on the first call, so later calls find nothing left to close.
    """
    self._closing.set()
    with self._lock:
        # Take ownership of the session under the lock. Clearing the shared
        # reference means the worker's cleanup path (which closes
        # ``self._session`` when it is not None) will not close the same
        # session a second time, and readers no longer see a closed session
        # as "connected".
        session = self._session
        self._session = None
    if session is not None:
        try:
            # NOTE(review): presumably closing the session also unblocks a
            # pending recv() in the worker thread -- confirm against backend.
            session.close()
        except Exception:
            # Best-effort teardown: a failing close must not abort shutdown.
            pass
    thread = self._thread
    if thread is not None and thread.is_alive():
        thread.join(timeout=0.5)
class NetworkTelemetryService:
def __init__(
@@ -362,6 +403,7 @@ class NetworkTelemetryService:
self._sample_thread: threading.Thread | None = None
self._sample_started = False
self._last_remote_snapshot_at = 0.0
self._closing = threading.Event()
def _ensure_started(self) -> None:
self._video_receiver.ensure_started()
@@ -369,7 +411,7 @@ class NetworkTelemetryService:
self._native_ingress.ensure_started()
self._hub_receiver.ensure_started()
with self._rate_lock:
if self._sample_started:
if self._sample_started or self._closing.is_set():
return
self._sample_started = True
self._sample_thread = threading.Thread(
@@ -381,7 +423,7 @@ class NetworkTelemetryService:
def _sample_loop(self) -> None:
interval_seconds = LOCAL_SAMPLE_INTERVAL_MS / 1000.0
while True:
while not self._closing.is_set():
try:
self._trend_tracker.add_sample("a_to_d.video", self._video_receiver.session_kcp_stats())
self._trend_tracker.add_sample("a_to_d.control", self._control_sender.session_kcp_stats())
@@ -431,9 +473,12 @@ class NetworkTelemetryService:
stale: bool,
) -> dict[str, Any]:
described = self._trend_tracker.describe(trend_key, current_kcp)
connected = bool(described["kcp"].get("connected"))
if app_stats is not None and "registered" in app_stats:
connected = bool(app_stats.get("registered"))
return {
"peer_id": peer_id,
"connected": bool(described["kcp"].get("connected")),
"connected": connected,
"updated_at": updated_at,
"stale": stale,
"app": app_stats,
@@ -561,9 +606,13 @@ class NetworkTelemetryService:
)
latency_ms = primary_kcp.get("srtt_ms") if primary_session is not None else None
jitter_ms = primary_kcp.get("srttvar_ms") if primary_session is not None else None
local_control_registered = bool(control_app.get("registered", 0))
remote_control_fresh = bool(remote_sessions["control"].get("connected")) and not bool(remote_sessions["control"].get("stale"))
if fresh_connected_sessions > 0:
if local_control_registered and remote_control_fresh:
peer_status = "online"
elif local_control_registered or bool(local_sessions["video"].get("connected")):
peer_status = "degraded"
elif sender_status.get("backend_ready"):
peer_status = "idle"
else:
@@ -605,6 +654,9 @@ class NetworkTelemetryService:
"hub_stale": remote_stale,
"last_error": telemetry_state.get("last_error", ""),
"peer_id": telemetry_state.get("peer_id", ""),
"registered": bool(telemetry_state.get("registered", False)),
"last_server_error": str(telemetry_state.get("last_server_error", "") or ""),
"reconnect_count": int(telemetry_state.get("reconnect_count", 0)),
},
"ingress": {
"native_udp": ingress_status,
@@ -614,3 +666,9 @@ class NetworkTelemetryService:
"sender": sender_status,
},
}
def close(self) -> None:
    """Signal the sampling loop to stop and briefly wait for it to finish.

    Sets the ``_closing`` event, then joins the sample thread for at most
    0.5 seconds if that thread exists and is still running. The join is
    bounded, so the thread may outlive this call.
    """
    self._closing.set()
    sampler = self._sample_thread
    if sampler is None or not sampler.is_alive():
        # Nothing was started, or the loop has already exited.
        return
    sampler.join(timeout=0.5)