Files
OmniSocketGo/scripts/boot/blitz-watchdog.sh
2026-04-14 20:52:41 +08:00

972 lines
30 KiB
Bash

#!/usr/bin/env bash
# Watchdog for the Blitz robot stack. After loading the shared boot helpers it
# runs an endless loop that checks the 5G network path, the GPS stack, the
# camera device, the B-side service and the ROS receiver, restarts what looks
# unhealthy, and writes a status file plus JSONL event/sample logs.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument of every blitz_log call in this file.
STEP="watchdog"
# systemd units that this watchdog restarts.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"
# File paths; left empty here and filled in once the runtime/run directories
# are known (see the assignments just before the main loop).
B_SIDE_STATUS_FILE=""
ROS_STATUS_FILE=""
WATCHDOG_STATUS_FILE=""
NETWORK_FAULT_FILE=""
WATCHDOG_EVENT_LOG=""
WATCHDOG_SAMPLE_LOG=""
# Latches: set to 1 after a log-append failure has been reported, reset to 0 on
# the next successful append, so a persistent failure is logged only once.
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
# Camera tracking: whether the device was missing on an earlier pass, and how
# many consecutive passes it has been present again since then.
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
# Consecutive failed network checks (compared against BLITZ_NETWORK_FAIL_THRESHOLD).
NETWORK_FAIL_COUNT=0
# Epoch seconds; while "now" is below these values the loop skips the network
# check (cooldown) or skips all checks and only reports "backoff".
NETWORK_COOLDOWN_UNTIL=0
BACKOFF_UNTIL=0
# Most recent recovery action and when it was taken (see set_last_action).
LAST_ACTION="none"
LAST_ACTION_EPOCH_MS=0
# Sliding window used by record_full_restart (start time and restart count).
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0
# Interface returned by the most recent resolve attempt (may be empty) and the
# last non-empty interface that was resolved or had routes synced to it.
NETWORK_LAST_INTERFACE=""
NETWORK_ROUTE_INTERFACE_LAST_KNOWN=""
# Epoch seconds of the last primary-route repair attempt made while the
# fallback path was carrying traffic; 0 means "retry immediately".
NETWORK_PRIMARY_LAST_RETRY_SEC=0
# GPS tracking. GPS_DEVICE_PRESENT_PREV starts at -1 ("never checked"), which
# check_gps_health treats as "not 0", so the very first miss is logged.
GPS_LAST_CHECK_SEC=0
GPS_DEVICE_PRESENT_PREV=-1
GPS_DEVICE_PRESENT_STATE=1
GPS_STACK_ACTIVE_STATE=1
# Last (fault_reason, recovery_state) pair written as a state-transition event.
LAST_REPORTED_FAULT_REASON=""
LAST_REPORTED_RECOVERY_STATE=""
# Per-fault-key window start time and restart count used by
# register_targeted_restart.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()
# Print the current Unix time in whole seconds, followed by a newline.
now_epoch_sec() {
  date '+%s'
}
# Print the current Unix time in milliseconds: epoch seconds immediately
# followed by the first three digits of the nanosecond field (%3N, a GNU date
# extension).
now_epoch_ms() {
  date '+%s%3N'
}
# Succeed when systemd reports the given unit as active.
# Arguments: $1 - unit name
# Returns:   the exit status of `systemctl is-active --quiet`
service_is_active() {
  local unit_name="$1"
  systemctl is-active --quiet "${unit_name}"
}
# Succeed only when BLITZ_GPS_MONITOR_ENABLED is exactly "1"; an unset or
# empty value counts as disabled.
gps_monitor_enabled() {
  case "${BLITZ_GPS_MONITOR_ENABLED:-0}" in
    1) return 0 ;;
    *) return 1 ;;
  esac
}
# Succeed when at least one unit listed in BLITZ_GPS_RESTART_UNITS (a
# whitespace-separated list) is active. An empty or unset list means "not
# active".
gps_stack_active() {
  local -a gps_units=()
  local gps_unit
  read -r -a gps_units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  # Bail out before the loop so an empty array is never expanded.
  (( ${#gps_units[@]} > 0 )) || return 1
  for gps_unit in "${gps_units[@]}"; do
    service_is_active "${gps_unit}" && return 0
  done
  return 1
}
# Restart every unit listed in BLITZ_GPS_RESTART_UNITS with a single
# `systemctl restart` call and record the outcome.
# Globals:   BLITZ_GPS_RESTART_UNITS (read), GPS_STACK_ACTIVE_STATE (written:
#            1 on success, 0 on failure or when the unit list is empty)
# Arguments: $1 - reason, copied into the log lines
#            $2 - device summary, copied into the log lines
# Returns:   0 on success, 1 when no units are configured, otherwise the exit
#            status of `systemctl restart`
restart_gps_stack() {
  local reason="$1"
  local devices="$2"
  local -a units=()
  local rc=0
  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    GPS_STACK_ACTIVE_STATE=0
    blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1
    return 1
  fi
  set_last_action "gps-reconnect"
  blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
  # Capture the status right at the failing command. Reading $? after a
  # closed `if ...; fi` always yields 0, which would report a failed restart
  # as rc=0 and make this function return success.
  systemctl restart "${units[@]}" || rc=$?
  if (( rc == 0 )); then
    GPS_STACK_ACTIVE_STATE=1
    blitz_log "${STEP}" "gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
    return 0
  fi
  GPS_STACK_ACTIVE_STATE=0
  blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}"
  return "${rc}"
}
# Rate-limited GPS health probe.
# Globals:   BLITZ_GPS_CHECK_INTERVAL_SEC, BLITZ_GPS_DEVICE_GLOB (read);
#            GPS_LAST_CHECK_SEC, GPS_DEVICE_PRESENT_PREV,
#            GPS_DEVICE_PRESENT_STATE, GPS_STACK_ACTIVE_STATE (read/written)
# Arguments: $1 - current time in epoch seconds
# Returns:   0 when GPS is considered healthy (or monitoring is disabled),
#            1 otherwise
check_gps_health() {
  local now_sec="$1"
  local interval="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
  local glob_pattern="${BLITZ_GPS_DEVICE_GLOB:-}"
  local was_present="${GPS_DEVICE_PRESENT_PREV}"
  local restart_reason=""
  local joined=""
  local -a found=()

  # Monitoring disabled: report everything as fine.
  if ! gps_monitor_enabled; then
    GPS_DEVICE_PRESENT_STATE=1
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  # Never probe more often than once per second.
  (( interval >= 1 )) || interval=1

  # Inside the check interval: answer from the cached state only.
  if (( GPS_LAST_CHECK_SEC != 0 && now_sec - GPS_LAST_CHECK_SEC < interval )); then
    (( GPS_DEVICE_PRESENT_STATE == 1 && GPS_STACK_ACTIVE_STATE == 1 )) && return 0
    return 1
  fi
  GPS_LAST_CHECK_SEC="${now_sec}"

  # compgen -G prints one matching path per line and fails when nothing matches.
  mapfile -t found < <(compgen -G "${glob_pattern}" || true)
  if (( ${#found[@]} == 0 )); then
    GPS_DEVICE_PRESENT_STATE=0
    GPS_STACK_ACTIVE_STATE=0
    # Log only on the transition into "missing", not on every pass.
    if (( was_present != 0 )); then
      blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${glob_pattern}" 1
    fi
    GPS_DEVICE_PRESENT_PREV=0
    return 1
  fi

  # Comma-join the matched device paths for the log lines.
  joined="$(printf '%s,' "${found[@]}")"
  joined="${joined%,}"
  GPS_DEVICE_PRESENT_STATE=1
  GPS_DEVICE_PRESENT_PREV=1

  if (( was_present == 0 )); then
    blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${joined}" 0
    restart_reason="device-reappeared"
  elif ! gps_stack_active; then
    restart_reason="gpsd-inactive"
  fi

  if [[ -z "${restart_reason}" ]]; then
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi
  restart_gps_stack "${restart_reason}" "${joined}" && return 0
  return 1
}
# Succeed when a regular file exists at the given path and its modification
# time is at most the given number of seconds in the past.
# Arguments: $1 - file path
#            $2 - maximum age in seconds
# Note: if stat fails the mtime is taken as 0, which makes the file look
# extremely old.
status_file_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local age_sec
  [[ -f "${path}" ]] || return 1
  age_sec=$(( $(now_epoch_sec) - $(stat -c %Y "${path}" 2>/dev/null || echo 0) ))
  (( age_sec <= max_age_sec ))
}
# Succeed when the ROS receiver status JSON reports a bound socket and a
# receive-thread heartbeat no older than the allowed age.
# Arguments: $1 - path of the status JSON file
#            $2 - maximum heartbeat age in seconds
# Returns:   0 when fresh; 1 when stale, when the socket is not bound, or when
#            the file is missing, unreadable or malformed.
# The embedded Python must start at column 0 and keep its own indentation.
ros_receiver_status_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local now_ms
  now_ms="$(now_epoch_ms)"
  python3 - "${path}" "${now_ms}" "${max_age_sec}" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    now_epoch_ms = int(sys.argv[2])
    max_age_ms = int(sys.argv[3]) * 1000
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    # A payload that is not a JSON object, or a heartbeat that is not numeric,
    # is treated like any other unreadable status file: quietly "not fresh"
    # instead of an uncaught traceback on every watchdog pass.
    heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0)
    socket_bound = bool(payload.get("socket_bound"))
except Exception:
    raise SystemExit(1)
if heartbeat_ms <= 0 or not socket_bound:
    raise SystemExit(1)
raise SystemExit(0 if now_epoch_ms - heartbeat_ms <= max_age_ms else 1)
PY
}
# Succeed only when all four checks pass, evaluated in this order and stopping
# at the first failure: ROS service active, local receiver path is a socket,
# status file recently modified, heartbeat inside the status file recent.
# Arguments: $1 - maximum age in seconds, used for both freshness checks
# Returns:   0 when healthy, otherwise the status of the first failing check
ros_receiver_healthy() {
  local max_age_sec="$1"
  service_is_active "${ROS_SERVICE}" || return
  [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] || return
  status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" || return
  ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}"
}
# Rewrite the watchdog status file: the JSON document is written to a
# PID-suffixed temporary file next to the target and then moved over it.
# Arguments: $1 fault reason, $2 recovery state, $3 network_ok, $4 camera_ok,
#            $5 ros_ok, $6 bside_ok, $7 gps_ok, $8 gps_device_present
#            ($3-$8 are inserted verbatim as JSON values; callers pass 0/1)
# Globals:   WATCHDOG_STATUS_FILE, NETWORK_FAIL_COUNT,
#            FULL_RESTART_WINDOW_COUNT, LAST_ACTION, LAST_ACTION_EPOCH_MS (read)
# Strings are interpolated without JSON escaping.
write_watchdog_status() {
  local reason="$1"
  local state="$2"
  local net_flag="$3"
  local cam_flag="$4"
  local ros_flag="$5"
  local bside_flag="$6"
  local gps_flag="$7"
  local gps_dev_flag="$8"
  local staging="${WATCHDOG_STATUS_FILE}.tmp.$$"
  {
    printf '{\n'
    printf '"updated_at_epoch_ms": %s,\n' "$(now_epoch_ms)"
    printf '"fault_reason": "%s",\n' "${reason}"
    printf '"recovery_state": "%s",\n' "${state}"
    printf '"network_ok": %s,\n' "${net_flag}"
    printf '"camera_ok": %s,\n' "${cam_flag}"
    printf '"ros_ok": %s,\n' "${ros_flag}"
    printf '"bside_ok": %s,\n' "${bside_flag}"
    printf '"gps_ok": %s,\n' "${gps_flag}"
    printf '"gps_device_present": %s,\n' "${gps_dev_flag}"
    printf '"network_fail_count": %s,\n' "${NETWORK_FAIL_COUNT}"
    printf '"targeted_restart_count": %s,\n' "$(targeted_restart_total)"
    printf '"full_restart_count": %s,\n' "${FULL_RESTART_WINDOW_COUNT}"
    printf '"last_action": "%s",\n' "${LAST_ACTION}"
    printf '"last_action_epoch_ms": %s\n' "${LAST_ACTION_EPOCH_MS}"
    printf '}\n'
  } > "${staging}"
  mv -f "${staging}" "${WATCHDOG_STATUS_FILE}"
}
# Print one compact JSON record describing a watchdog event or sample.
# Arguments: $1 record type, $2 action, $3 fault reason, $4 recovery state,
#            $5 detail, $6 incident id (optional; empty becomes JSON null),
#            $7-$12 network/camera/ros/bside/gps/gps-device flags (optional,
#            default "1"; only the exact string "1" is emitted as true)
# Globals:   LAST_ACTION, LAST_ACTION_EPOCH_MS, NETWORK_FAIL_COUNT,
#            FULL_RESTART_WINDOW_COUNT (read)
# Outputs:   a single JSON line on stdout; ts_unix_ms is taken from the Python
#            clock at emit time
watchdog_emit_json() {
  local -a emit_args=(
    "$1" "$2" "$3" "$4" "$5"
    "${6:-}"
    "${7:-1}" "${8:-1}" "${9:-1}" "${10:-1}" "${11:-1}" "${12:-1}"
    "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}"
    "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}"
  )
  python3 - "${emit_args[@]}" <<'PY'
import json
import sys
import time

TEXT_FIELDS = ("record_type", "action", "fault_reason", "recovery_state", "detail")
FLAG_FIELDS = ("network_ok", "camera_ok", "ros_ok", "bside_ok", "gps_ok", "gps_device_present")
COUNTER_FIELDS = ("network_fail_count", "targeted_restart_count", "full_restart_count")
# Positional order in which the shell wrapper passes the values.
ARG_ORDER = (
    TEXT_FIELDS
    + ("incident_id",)
    + FLAG_FIELDS
    + ("last_action", "last_action_epoch_ms")
    + COUNTER_FIELDS
)

raw = sys.argv[1:18]
if len(raw) != len(ARG_ORDER):
    raise ValueError("expected %d arguments, got %d" % (len(ARG_ORDER), len(raw)))
arg = dict(zip(ARG_ORDER, raw))

# Keys are inserted in the order they must appear in the output line.
record = {"ts_unix_ms": time.time_ns() // 1_000_000}
record.update((name, arg[name]) for name in TEXT_FIELDS)
record["incident_id"] = arg["incident_id"] or None
record.update((name, arg[name] == "1") for name in FLAG_FIELDS)
record.update((name, int(arg[name])) for name in COUNTER_FIELDS)
record["last_action"] = arg["last_action"]
record["last_action_epoch_ms"] = int(arg["last_action_epoch_ms"] or 0)
print(json.dumps(record, separators=(",", ":"), ensure_ascii=False))
PY
}
# Best-effort append of one event record to WATCHDOG_EVENT_LOG.
# Arguments: forwarded unchanged to watchdog_emit_json
# Globals:   WATCHDOG_EVENT_LOG (read),
#            WATCHDOG_EVENT_LOG_FAILURE_REPORTED (read/written)
# Returns:   always 0. A failure is logged once and then suppressed until an
#            append succeeds again.
watchdog_append_event() {
  local json_line=""
  local failure_detail=""
  if [[ -z "${WATCHDOG_EVENT_LOG}" ]]; then
    return 0
  fi
  if json_line="$(watchdog_emit_json "$@" 2>&1)"; then
    if blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${json_line}"; then
      WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
      return 0
    fi
    failure_detail="append-failed"
  else
    # stderr was merged into the capture, so it carries the error text.
    failure_detail="${json_line}"
  fi
  if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
    blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${failure_detail}" 0 || true
    WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
  fi
  return 0
}
# Best-effort append of one periodic sample record to WATCHDOG_SAMPLE_LOG.
# Arguments: forwarded unchanged to watchdog_emit_json
# Globals:   WATCHDOG_SAMPLE_LOG (read),
#            WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED (read/written)
# Returns:   always 0. A failure is logged once and then suppressed until an
#            append succeeds again.
watchdog_append_sample() {
  local sample_line=""
  local problem=""
  if [[ -z "${WATCHDOG_SAMPLE_LOG}" ]]; then
    return 0
  fi
  if sample_line="$(watchdog_emit_json "$@" 2>&1)"; then
    if blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${sample_line}"; then
      WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
      return 0
    fi
    problem="append-failed"
  else
    # stderr was merged into the capture, so it carries the error text.
    problem="${sample_line}"
  fi
  if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
    blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${problem}" 0 || true
    WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
  fi
  return 0
}
# Emit a "state-transition" event whenever the (fault reason, recovery state)
# pair differs from the pair that was last reported, then remember the new
# pair. Does nothing when the pair is unchanged.
# Arguments: $1 - fault reason, $2 - recovery state
# Globals:   LAST_REPORTED_FAULT_REASON, LAST_REPORTED_RECOVERY_STATE
#            (read/written)
watchdog_record_state_transition() {
  local new_fault="$1"
  local new_state="$2"
  if [[ "${new_fault}" != "${LAST_REPORTED_FAULT_REASON}" || "${new_state}" != "${LAST_REPORTED_RECOVERY_STATE}" ]]; then
    watchdog_append_event "event" "state-transition" "${new_fault}" "${new_state}" "" ""
    LAST_REPORTED_FAULT_REASON="${new_fault}"
    LAST_REPORTED_RECOVERY_STATE="${new_state}"
  fi
  return 0
}
# Run the incident-capture launcher for a watchdog-detected failure.
# Arguments: $1 - reason, $2 - unit name
# Outputs:   whatever the launcher prints on stdout (callers capture it as the
#            incident id); the launcher's stderr is discarded
# Returns:   always 0, even when the launcher fails or is missing
watchdog_launch_incident() {
  local reason="$1"
  local unit_name="$2"
  local launcher="${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh"
  local -a launcher_args=(
    --source watchdog
    --reason "${reason}"
    --unit "${unit_name}"
    --result failure
    --exit-status 1
  )
  "${launcher}" "${launcher_args[@]}" 2>/dev/null || true
}
# Remember the most recent recovery action and stamp it with the current time
# in epoch milliseconds.
# Arguments: $1 - action name
# Globals:   LAST_ACTION, LAST_ACTION_EPOCH_MS (written)
set_last_action() {
  local action_name="$1"
  LAST_ACTION="${action_name}"
  LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}
# Print the sum of the per-fault-key counters held in
# TARGETED_RESTART_WINDOW_COUNT (0 when the map is empty).
targeted_restart_total() {
  local sum=0
  local fault_key
  for fault_key in "${!TARGETED_RESTART_WINDOW_COUNT[@]}"; do
    sum=$(( sum + ${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]} ))
  done
  printf '%s\n' "${sum}"
}
# Count a targeted restart for the given fault key inside a 60-second window
# and tell the caller whether to escalate.
# Arguments: $1 - fault key
# Globals:   TARGETED_RESTART_WINDOW_START, TARGETED_RESTART_WINDOW_COUNT
#            (read/written for that key)
# Returns:   0 when this is at least the second restart in the current window
#            (escalate), 1 otherwise
register_targeted_restart() {
  local fault_key="$1"
  local now_sec
  local window_opened
  local restarts
  now_sec="$(now_epoch_sec)"
  window_opened="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}"
  restarts="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}"
  if (( window_opened != 0 && now_sec - window_opened <= 60 )); then
    # Still inside the open window: one more restart.
    restarts=$(( restarts + 1 ))
  else
    # No window yet, or the old one expired: open a new one.
    window_opened="${now_sec}"
    restarts=1
  fi
  TARGETED_RESTART_WINDOW_START["${fault_key}"]="${window_opened}"
  TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${restarts}"
  (( restarts >= 2 ))
}
# Count a full-stack restart inside a 600-second window. From the third
# restart in one window onward, set BACKOFF_UNTIL to 60 seconds from now and
# emit a "backoff-enter" event.
# Globals:   FULL_RESTART_WINDOW_START, FULL_RESTART_WINDOW_COUNT
#            (read/written), BACKOFF_UNTIL (written)
record_full_restart() {
  local now_sec
  now_sec="$(now_epoch_sec)"
  if (( FULL_RESTART_WINDOW_START != 0 && now_sec - FULL_RESTART_WINDOW_START <= 600 )); then
    FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
  else
    # No window yet, or the old one expired: open a new one.
    FULL_RESTART_WINDOW_START="${now_sec}"
    FULL_RESTART_WINDOW_COUNT=1
  fi
  if (( FULL_RESTART_WINDOW_COUNT < 3 )); then
    return 0
  fi
  BACKOFF_UNTIL=$(( now_sec + 60 ))
  watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" ""
}
# Restart only the B-side service, or escalate to a full stack restart when
# register_targeted_restart reports a repeat for this fault key.
# Arguments: $1 - fault key used for the restart window bookkeeping
#            $2 - reason, copied into log lines and events
# Globals:   RECOVERY_ACTION_TAKEN (set to 1 on the non-escalated path)
# Returns:   0 after an escalation (whatever the full restart returned) and
#            after a successful restart; otherwise the exit status of
#            `systemctl restart`
restart_bside_targeted() {
  local fault_key="$1"
  local reason="$2"
  local rc=0
  local incident_id=""
  if register_targeted_restart "${fault_key}"; then
    blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
    watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" ""
    full_restart_stack "${reason}-escalated"
    return 0
  fi
  incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")"
  set_last_action "restart-bside"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
  watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
  # Capture the status right at the failing command. Reading $? after a
  # closed `if ...; fi` always yields 0, which would report a failed restart
  # as rc=0 and make this function return success.
  systemctl restart "${B_SIDE_SERVICE}" || rc=$?
  if (( rc == 0 )); then
    blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
    watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
    return 0
  fi
  blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
  watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}"
  return "${rc}"
}
# Restart the whole stack in order: stop B-side, restart the ROS receiver,
# wait for its unix socket, start B-side again. Every attempt, successful or
# not, is counted through record_full_restart.
# Arguments: $1 - reason, copied into log lines and events
# Globals:   RECOVERY_ACTION_TAKEN (set to 1). recovery_state and
#            fault_reason are deliberately assigned without `local`, so the
#            caller's variables of those names are updated.
# Returns:   0 on success, otherwise the exit status of the stage that failed
# Each failing stage emits a "full-restart-failure" event carrying the stage
# name and rc, so a "full-restart-start" event always has a matching outcome.
full_restart_stack() {
  local reason="$1"
  local rc
  local incident_id=""
  incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")"
  set_last_action "full-restart"
  RECOVERY_ACTION_TAKEN=1
  recovery_state="recovering"
  fault_reason="${reason}"
  blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
  watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}"
  # Stopping B-side is best effort; it may already be down.
  systemctl stop "${B_SIDE_SERVICE}" || true
  if systemctl restart "${ROS_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=restart-ros rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi
  if bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=wait-socket rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi
  if systemctl start "${B_SIDE_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi
  watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}"
  record_full_restart
}
# Succeed when fault injection is enabled (BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION
# is exactly "1") and the fault marker file exists.
# The flag defaults to "0" when unset, so a missing setting disables fault
# injection instead of aborting the script under `set -u`.
network_fault_injected() {
  [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}" == "1" && -f "${NETWORK_FAULT_FILE}" ]]
}
# Ask blitz_resolve_5g_interface for the current 5G interface name.
# Globals:   NETWORK_LAST_INTERFACE (always overwritten, possibly with an
#            empty string), NETWORK_ROUTE_INTERFACE_LAST_KNOWN (updated only
#            when a name was resolved)
# Returns:   0 when an interface name was resolved, 1 otherwise
resolve_network_interface() {
  NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)"
  [[ -n "${NETWORK_LAST_INTERFACE}" ]] || return 1
  NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}"
  return 0
}
# Print the route targets, one per line: BLITZ_TIME_SERVER_IP first (when
# set), then every entry of BLITZ_5G_ROUTE_TARGETS that differs from it.
# BLITZ_5G_ROUTE_TARGETS is a list separated by commas and/or whitespace.
# Entries are split with `read` rather than an unquoted expansion, so they are
# never glob-expanded against the current directory, and an unset list is
# treated as empty instead of aborting under `set -u`.
network_route_targets() {
  local time_server="${BLITZ_TIME_SERVER_IP:-}"
  local -a extra_targets=()
  local target
  if [[ -n "${time_server}" ]]; then
    printf '%s\n' "${time_server}"
  fi
  # read -d '' consumes the whole value and reports EOF as a non-zero
  # status even though the array was filled, hence the `|| true`.
  IFS=$', \t\n' read -r -d '' -a extra_targets <<< "${BLITZ_5G_ROUTE_TARGETS:-}" || true
  (( ${#extra_targets[@]} > 0 )) || return 0
  for target in "${extra_targets[@]}"; do
    if [[ -n "${target}" && "${target}" != "${time_server}" ]]; then
      printf '%s\n' "${target}"
    fi
  done
  return 0
}
# For every route target, log the first line of `ip route get <target>`
# (stderr included), or "unresolved" when that produces no output.
# Arguments: $1 - action label copied into each log line
log_target_route_paths() {
  local action="$1"
  local destination
  local first_line
  while IFS= read -r destination; do
    if [[ -z "${destination}" ]]; then
      continue
    fi
    first_line="$(ip route get "${destination}" 2>&1 | head -n 1 || true)"
    blitz_log "${STEP}" "route-path" "info" "action=${action} target=${destination} route=${first_line:-unresolved}" 0
  done < <(network_route_targets)
}
# Succeed when the route text names the given interface as a whole token:
# " dev <name> " somewhere inside, or " dev <name>" at the very end.
# Arguments: $1 - route text, $2 - interface name (empty never matches)
route_output_uses_interface() {
  local route_output="$1"
  local interface_name="$2"
  if [[ -z "${interface_name}" ]]; then
    return 1
  fi
  case "${route_output}" in
    *" dev ${interface_name} "* | *" dev ${interface_name}") return 0 ;;
    *) return 1 ;;
  esac
}
# Succeed when the route text goes through exactly the given gateway, i.e. it
# contains "via <gateway>" as whole tokens.
# Arguments: $1 - route text, $2 - gateway address (empty never matches)
# The text is padded with spaces so the match is bounded on both sides; a
# plain substring test would accept 192.168.1.10 when asked for 192.168.1.1.
route_output_uses_gateway() {
  local route_output="$1"
  local gateway="$2"
  [[ -n "${gateway}" ]] || return 1
  [[ " ${route_output} " == *" via ${gateway} "* ]]
}
# Succeed when the route text uses both the given interface and the given
# gateway.
# Arguments: $1 - route text, $2 - interface name, $3 - gateway address
route_is_desired_target_route() {
  local route_output="$1"
  local interface_name="$2"
  local gateway="$3"
  route_output_uses_interface "${route_output}" "${interface_name}" || return
  route_output_uses_gateway "${route_output}" "${gateway}"
}
# Succeed when the route text uses the given interface, or goes through
# either the given gateway or the configured BLITZ_5G_GATEWAY.
# Arguments: $1 - route text
#            $2 - interface name (optional)
#            $3 - gateway address (optional)
route_is_managed_5g_route() {
  local route_output="$1"
  local interface_name="${2:-}"
  local gateway="${3:-}"
  local candidate
  if route_output_uses_interface "${route_output}" "${interface_name}"; then
    return 0
  fi
  for candidate in "${gateway}" "${BLITZ_5G_GATEWAY:-}"; do
    if route_output_uses_gateway "${route_output}" "${candidate}"; then
      return 0
    fi
  done
  return 1
}
# Print the interface whose routes should be cleaned up, taking the first
# non-empty value of: NETWORK_LAST_INTERFACE,
# NETWORK_ROUTE_INTERFACE_LAST_KNOWN, then whatever
# blitz_read_5g_info_interface reports for BLITZ_5G_INFO_JSON.
# Returns:   0 when a name was printed, 1 when none is known
resolve_route_cleanup_interface() {
  local info_json="${BLITZ_5G_INFO_JSON:-}"
  local candidate
  for candidate in "${NETWORK_LAST_INTERFACE}" "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}"; do
    if [[ -n "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  candidate="$(blitz_read_5g_info_interface "${info_json}" || true)"
  [[ -n "${candidate}" ]] || return 1
  printf '%s\n' "${candidate}"
  return 0
}
# Print the gateway for the given interface: the token that follows "via" in
# the first default route on that interface, or BLITZ_5G_GATEWAY when no such
# token exists.
# Arguments: $1 - interface name
# Returns:   0 when a gateway was printed, 1 when none could be determined
resolve_network_gateway() {
  local interface_name="$1"
  local route_line
  local via_addr=""
  local -a words=()
  local pos=0
  route_line="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)"
  if [[ -n "${route_line}" ]]; then
    read -r -a words <<< "${route_line}"
    # Stop one short of the end so "via" as the final word is ignored.
    while (( pos < ${#words[@]} - 1 )); do
      if [[ "${words[pos]}" == "via" ]]; then
        via_addr="${words[pos + 1]}"
        break
      fi
      pos=$(( pos + 1 ))
    done
  fi
  if [[ -z "${via_addr}" ]]; then
    via_addr="${BLITZ_5G_GATEWAY:-}"
  fi
  [[ -n "${via_addr}" ]] || return 1
  printf '%s\n' "${via_addr}"
  return 0
}
# Make sure every route target has a /32 route through the given interface
# and gateway, replacing routes that are missing or point elsewhere.
# Arguments: $1 - interface name (required)
#            $2 - gateway (optional; resolved from the interface when empty)
# Globals:   NETWORK_ROUTE_INTERFACE_LAST_KNOWN (written when a route changed)
# Returns:   0 on success, 1 when the interface or gateway is unknown,
#            otherwise the exit status of the failing `ip route replace`
sync_target_routes_to_5g() {
  local interface_name="$1"
  local gateway="${2:-}"
  local current_route=""
  local changed=0
  local target
  local rc
  [[ -n "${interface_name}" ]] || return 1
  if [[ -z "${gateway}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
    if [[ -z "${gateway}" ]]; then
      blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1
      return 1
    fi
  fi
  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    current_route="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    # Leave routes alone that already point where they should.
    if [[ -n "${current_route}" ]] && route_is_desired_target_route "${current_route}" "${interface_name}" "${gateway}"; then
      continue
    fi
    ip route replace "${target}/32" via "${gateway}" dev "${interface_name}" || {
      rc=$?
      blitz_log "${STEP}" "route-sync-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}"
      return "${rc}"
    }
    changed=1
    blitz_log "${STEP}" "route-sync-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0
  done < <(network_route_targets)
  if (( changed == 1 )); then
    NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}"
    log_target_route_paths "sync-to-5g"
  fi
  return 0
}
# Delete the /32 route of every route target that route_is_managed_5g_route
# recognises for the given interface/gateway; other routes are left alone.
# Arguments: $1 - interface name (optional; falls back to
#                 resolve_route_cleanup_interface)
#            $2 - gateway (optional; resolved from the interface, then from
#                 BLITZ_5G_GATEWAY)
# Returns:   0 on success (including "nothing to remove"), otherwise the exit
#            status of the failing `ip route del`
clear_target_routes_from_5g() {
  local interface_name="${1:-}"
  local gateway="${2:-}"
  local existing_route=""
  local target
  local removed=0
  local rc
  local iface_label
  local gateway_label
  if [[ -z "${interface_name}" ]]; then
    interface_name="$(resolve_route_cleanup_interface || true)"
  fi
  if [[ -z "${gateway}" && -n "${interface_name}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
  fi
  gateway="${gateway:-${BLITZ_5G_GATEWAY:-}}"
  # Labels for log lines; neither value changes past this point.
  iface_label="${interface_name:-unknown}"
  gateway_label="${gateway:-unknown}"
  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    existing_route="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    [[ -n "${existing_route}" ]] || continue
    route_is_managed_5g_route "${existing_route}" "${interface_name}" "${gateway}" || continue
    ip route del "${target}/32" || {
      rc=$?
      blitz_log "${STEP}" "route-clear-target" "failure" "target=${target} interface=${iface_label} gateway=${gateway_label}" "${rc}"
      return "${rc}"
    }
    removed=1
    blitz_log "${STEP}" "route-clear-target" "success" "target=${target} interface=${iface_label} gateway=${gateway_label}" 0
  done < <(network_route_targets)
  if (( removed == 1 )); then
    blitz_log "${STEP}" "route-clear" "success" "interface=${iface_label} gateway=${gateway_label}" 0
    log_target_route_paths "clear-from-5g"
  fi
  return 0
}
# Try to bring the route targets back onto the given interface and verify the
# result: sync the target routes, check blitz_route_ready for the time server,
# then send one ping through the interface. When any step after the gateway
# lookup fails, the target routes are cleared again (best effort).
# Arguments: $1 - interface name
# Returns:   0 when the routes are in place and the probe succeeded, else 1
repair_network_routes() {
  local interface_name="$1"
  local gateway=""
  local ready_route
  [[ -n "${interface_name}" ]] || return 1
  gateway="$(resolve_network_gateway "${interface_name}" || true)"
  [[ -n "${gateway}" ]] || {
    blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1
    return 1
  }
  sync_target_routes_to_5g "${interface_name}" "${gateway}" || {
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    return 1
  }
  ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)"
  [[ -n "${ready_route}" ]] || {
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1
    return 1
  }
  ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 || {
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1
    return 1
  }
  blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${ready_route}" 0
  return 0
}
# Check the primary (5G) path: no injected fault, an interface can be
# resolved, blitz_route_ready reports a route to the time server on that
# interface, and one ping through the interface succeeds.
# Globals:   NETWORK_LAST_INTERFACE (cleared first, then set by
#            resolve_network_interface)
# Returns:   0 when healthy; 1 for the early failures, otherwise ping's status
network_is_healthy() {
  local ready_route
  NETWORK_LAST_INTERFACE=""
  if network_fault_injected || ! resolve_network_interface; then
    return 1
  fi
  ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
  [[ -n "${ready_route}" ]] || return 1
  ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
# Check reachability of the time server without pinning an interface:
# blitz_route_ready must report a route and one plain ping must succeed.
# Returns:   0 when reachable; 1 when BLITZ_TIME_SERVER_IP is unset/empty or
#            no route is ready, otherwise ping's status
fallback_network_is_healthy() {
  local probe_target="${BLITZ_TIME_SERVER_IP:-}"
  local ready_route
  [[ -n "${probe_target}" ]] || return 1
  ready_route="$(blitz_route_ready "${probe_target}" || true)"
  [[ -n "${ready_route}" ]] || return 1
  ping -c 1 -W 2 "${probe_target}" >/dev/null 2>&1
}
# Poll network_is_healthy once per second until it succeeds or the timeout is
# used up. A "waiting" line is logged on the first poll and every fifth
# second after that.
# Arguments: $1 - timeout in seconds
# Returns:   0 as soon as the network is healthy, 1 on timeout
wait_for_network_recovery() {
  local timeout_sec="$1"
  local elapsed
  for (( elapsed = 0; elapsed < timeout_sec; elapsed++ )); do
    if network_is_healthy; then
      blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${elapsed}" 0
      return 0
    fi
    if (( elapsed % 5 == 0 )); then
      blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${elapsed}" 0
    fi
    sleep 1
  done
  blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
  return 1
}
# Recover the primary network path. First tries the cheap fix (re-sync routes
# on the currently resolved interface); if that does not work, stops the
# B-side service, runs 5g-dial.sh, waits for the network to come back and then
# restarts the dependent services.
# Globals:   RECOVERY_ACTION_TAKEN (set to 1), NETWORK_COOLDOWN_UNTIL and
#            NETWORK_FAIL_COUNT (updated on success), LAST_ACTION via
#            set_last_action
# Returns:   0 on success; the failing stage's exit status when the dial
#            script or the post-check fails
perform_network_recovery() {
local rc=0
local incident_id=""
# Stage 1: route repair only. No incident capture is launched for this path.
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
set_last_action "route-repair"
RECOVERY_ACTION_TAKEN=1
# Skip network checks in the main loop until the cooldown has elapsed.
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
NETWORK_FAIL_COUNT=0
blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" ""
return 0
fi
# Stage 2: full recovery through the dial script.
incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")"
set_last_action "network-recovery"
RECOVERY_ACTION_TAKEN=1
blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}"
# B-side is stopped before dialing and is NOT started again on the failure
# returns below.
# NOTE(review): presumably the main loop's B-side health check picks it up on
# a later pass - confirm.
systemctl stop "${B_SIDE_SERVICE}" || true
if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then
:
else
rc=$?
blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}"
return "${rc}"
fi
if wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}"; then
:
else
rc=$?
blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}"
return "${rc}"
fi
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
NETWORK_FAIL_COUNT=0
watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}"
# Network is back: bring the services up again. If the ROS receiver still
# looks healthy only B-side is restarted; otherwise the whole stack is. The
# result of either restart is ignored here - the function reports success
# because the network itself recovered.
if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
restart_bside_targeted "network" "network-recovered"
return 0
fi
full_restart_stack "network-recovered-ros-unhealthy"
return 0
}
# Startup: load the boot environment, verify privileges and required tools,
# and prepare the runtime/run directories. All blitz_* helpers are expected to
# come from the sourced common.sh.
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command stat "${STEP}"
blitz_require_command ping "${STEP}"
blitz_require_command python3 "${STEP}"
blitz_prepare_runtime_dir
blitz_require_run_context
# Status files live in the runtime directory; the JSONL logs live in the run
# directory.
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
# Marker file whose presence simulates a network outage when fault injection
# is allowed (see network_fault_injected).
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl"
WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl"
# Main watchdog loop. Each pass starts from an "all healthy" picture, runs the
# checks in a fixed order (network, GPS, camera, B-side, ROS receiver), then
# publishes the result and sleeps BLITZ_WATCHDOG_INTERVAL_SEC. Later checks
# overwrite fault_reason/recovery_state set by earlier ones, and at most one
# service recovery action is started per pass (RECOVERY_ACTION_TAKEN).
while true; do
fault_reason="none"
recovery_state="ok"
network_ok=1
camera_ok=1
ros_ok=1
bside_ok=1
gps_ok=1
gps_device_present=1
RECOVERY_ACTION_TAKEN=0
now_sec="$(now_epoch_sec)"
# Seed the GPS flags from the cached state so the backoff branch below, which
# skips check_gps_health, still reports the last known GPS condition.
if gps_monitor_enabled; then
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
if (( GPS_DEVICE_PRESENT_STATE == 0 || GPS_STACK_ACTIVE_STATE == 0 )); then
gps_ok=0
fi
fi
# Backoff (entered by record_full_restart): run no checks and take no
# actions; report network/camera/ros/bside as 0 and wait for the next pass.
if (( BACKOFF_UNTIL > now_sec )); then
fault_reason="backoff"
recovery_state="backoff"
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
continue
fi
# --- Network ---
# During the post-recovery cooldown the network is not probed at all.
if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
recovery_state="recovering"
elif ! network_is_healthy; then
# Primary path is down: drop the managed /32 routes, then test whether the
# time server is still reachable some other way.
clear_target_routes_from_5g || true
if fallback_network_is_healthy; then
NETWORK_FAIL_COUNT=0
fault_reason="network_fallback_active"
recovery_state="degraded"
blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0
# While on the fallback path, retry the primary route repair at most once
# every 10 seconds.
if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then
NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}"
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
NETWORK_PRIMARY_LAST_RETRY_SEC=0
fault_reason="none"
recovery_state="ok"
blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0
log_target_route_paths "primary-restored"
fi
fi
else
# Neither path works: count the failure and start recovery once the
# threshold is reached.
network_ok=0
NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
fault_reason="network_or_robot_unreachable"
recovery_state="recovering"
blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
perform_network_recovery || true
fi
fi
else
# Primary path healthy: reset the counters and keep the target routes pinned
# to the 5G interface.
NETWORK_PRIMARY_LAST_RETRY_SEC=0
NETWORK_FAIL_COUNT=0
sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true
fi
# --- GPS ---
# A GPS problem only becomes the reported fault when nothing else has been
# flagged so far in this pass.
if check_gps_health "${now_sec}"; then
gps_ok=1
else
gps_ok=0
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
if [[ "${fault_reason}" == "none" ]]; then
if (( GPS_DEVICE_PRESENT_STATE == 0 )); then
fault_reason="gps_device_missing"
else
fault_reason="gps_reconnect_failed"
fi
recovery_state="degraded"
fi
fi
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
# --- Camera ---
# A missing device is only reported. Once it has been present again for two
# consecutive passes (with no other recovery action in the same pass) B-side
# is restarted.
if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
camera_ok=0
fault_reason="camera_missing"
recovery_state="degraded"
CAMERA_MISSING_PREV=1
CAMERA_RECOVERY_STABLE_COUNT=0
elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
recovery_state="recovering"
fault_reason="camera_recovered"
if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
restart_bside_targeted "camera" "camera-reappeared" || true
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
fi
else
CAMERA_RECOVERY_STABLE_COUNT=0
fi
# --- B-side service ---
# Skipped when a recovery action was already started in this pass.
if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
bside_ok=0
fault_reason="bside_status_stale"
recovery_state="recovering"
restart_bside_targeted "bside" "bside-unhealthy" || true
fi
# --- ROS receiver ---
# Also skipped after an earlier recovery action; an unhealthy receiver
# triggers a full stack restart.
if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
ros_ok=0
fault_reason="ros_receiver_unhealthy"
recovery_state="recovering"
full_restart_stack "ros-unhealthy" || true
fi
# Publish the outcome of this pass: transition event (only on change), status
# file, and one sample record.
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done