#!/usr/bin/env bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck disable=SC1091 source "${SCRIPT_DIR}/common.sh" STEP="watchdog" B_SIDE_SERVICE="blitz-b-side-omnid.service" ROS_SERVICE="blitz-ros-receiver.service" B_SIDE_STATUS_FILE="" ROS_STATUS_FILE="" WATCHDOG_STATUS_FILE="" NETWORK_FAULT_FILE="" WATCHDOG_EVENT_LOG="" WATCHDOG_SAMPLE_LOG="" WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0 WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0 CAMERA_MISSING_PREV=0 CAMERA_RECOVERY_STABLE_COUNT=0 NETWORK_FAIL_COUNT=0 NETWORK_COOLDOWN_UNTIL=0 BACKOFF_UNTIL=0 LAST_ACTION="none" LAST_ACTION_EPOCH_MS=0 FULL_RESTART_WINDOW_START=0 FULL_RESTART_WINDOW_COUNT=0 NETWORK_LAST_INTERFACE="" NETWORK_ROUTE_INTERFACE_LAST_KNOWN="" NETWORK_PRIMARY_LAST_RETRY_SEC=0 GPS_LAST_CHECK_SEC=0 GPS_DEVICE_PRESENT_PREV=-1 GPS_DEVICE_PRESENT_STATE=1 GPS_STACK_ACTIVE_STATE=1 LAST_REPORTED_FAULT_REASON="" LAST_REPORTED_RECOVERY_STATE="" declare -A TARGETED_RESTART_WINDOW_START=() declare -A TARGETED_RESTART_WINDOW_COUNT=() now_epoch_sec() { date +%s } now_epoch_ms() { date +%s%3N } service_is_active() { systemctl is-active --quiet "$1" } gps_monitor_enabled() { [[ "${BLITZ_GPS_MONITOR_ENABLED:-0}" == "1" ]] } gps_stack_active() { local units=() local unit read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}" if (( ${#units[@]} == 0 )); then return 1 fi for unit in "${units[@]}"; do if service_is_active "${unit}"; then return 0 fi done return 1 } restart_gps_stack() { local reason="$1" local devices="$2" local units=() local rc read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}" if (( ${#units[@]} == 0 )); then GPS_STACK_ACTIVE_STATE=0 blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1 return 1 fi set_last_action "gps-reconnect" blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0 if systemctl restart "${units[@]}"; then GPS_STACK_ACTIVE_STATE=1 blitz_log "${STEP}" 
"gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0 return 0 fi rc=$? GPS_STACK_ACTIVE_STATE=0 blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}" return "${rc}" } check_gps_health() { local now_sec="$1" local check_interval_sec="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}" local device_glob="${BLITZ_GPS_DEVICE_GLOB:-}" local previous_present="${GPS_DEVICE_PRESENT_PREV}" local recovery_reason="" local device_summary="" local -a devices=() if ! gps_monitor_enabled; then GPS_DEVICE_PRESENT_STATE=1 GPS_STACK_ACTIVE_STATE=1 return 0 fi if (( check_interval_sec < 1 )); then check_interval_sec=1 fi if (( GPS_LAST_CHECK_SEC != 0 && now_sec - GPS_LAST_CHECK_SEC < check_interval_sec )); then if (( GPS_DEVICE_PRESENT_STATE == 1 && GPS_STACK_ACTIVE_STATE == 1 )); then return 0 fi return 1 fi GPS_LAST_CHECK_SEC="${now_sec}" mapfile -t devices < <(compgen -G "${device_glob}" || true) if (( ${#devices[@]} == 0 )); then GPS_DEVICE_PRESENT_STATE=0 GPS_STACK_ACTIVE_STATE=0 if (( previous_present != 0 )); then blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${device_glob}" 1 fi GPS_DEVICE_PRESENT_PREV=0 return 1 fi device_summary="$(IFS=,; printf '%s' "${devices[*]}")" GPS_DEVICE_PRESENT_STATE=1 GPS_DEVICE_PRESENT_PREV=1 if (( previous_present == 0 )); then blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${device_summary}" 0 recovery_reason="device-reappeared" elif ! gps_stack_active; then recovery_reason="gpsd-inactive" fi if [[ -n "${recovery_reason}" ]]; then if restart_gps_stack "${recovery_reason}" "${device_summary}"; then return 0 fi return 1 fi GPS_STACK_ACTIVE_STATE=1 return 0 } status_file_fresh() { local path="$1" local max_age_sec="$2" local now_sec local mtime_sec if [[ ! 
-f "${path}" ]]; then return 1 fi now_sec="$(now_epoch_sec)" mtime_sec="$(stat -c %Y "${path}" 2>/dev/null || echo 0)" (( now_sec - mtime_sec <= max_age_sec )) } ros_receiver_status_fresh() { local path="$1" local max_age_sec="$2" local now_epoch_ms_value now_epoch_ms_value="$(now_epoch_ms)" python3 - "${path}" "${now_epoch_ms_value}" "${max_age_sec}" <<'PY' import json import sys path = sys.argv[1] now_epoch_ms = int(sys.argv[2]) max_age_ms = int(sys.argv[3]) * 1000 try: with open(path, "r", encoding="utf-8") as handle: payload = json.load(handle) except Exception: raise SystemExit(1) heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0) socket_bound = bool(payload.get("socket_bound")) if heartbeat_ms <= 0 or not socket_bound: raise SystemExit(1) raise SystemExit(0 if now_epoch_ms - heartbeat_ms <= max_age_ms else 1) PY } ros_receiver_healthy() { local max_age_sec="$1" service_is_active "${ROS_SERVICE}" \ && [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] \ && status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" \ && ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" } write_watchdog_status() { local fault_reason="$1" local recovery_state="$2" local network_ok="$3" local camera_ok="$4" local ros_ok="$5" local bside_ok="$6" local gps_ok="$7" local gps_device_present="$8" local tmp_file tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$" cat > "${tmp_file}" <&1)"; then if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${line}" 0 || true WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1 fi return 0 fi if ! 
blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${line}"; then if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=append-failed" 0 || true WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1 fi return 0 fi WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0 } watchdog_append_sample() { local line="" [[ -n "${WATCHDOG_SAMPLE_LOG}" ]] || return 0 if ! line="$(watchdog_emit_json "$@" 2>&1)"; then if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${line}" 0 || true WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1 fi return 0 fi if ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${line}"; then if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=append-failed" 0 || true WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1 fi return 0 fi WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0 } watchdog_record_state_transition() { local fault_reason="$1" local recovery_state="$2" if [[ "${fault_reason}" == "${LAST_REPORTED_FAULT_REASON}" && "${recovery_state}" == "${LAST_REPORTED_RECOVERY_STATE}" ]]; then return 0 fi watchdog_append_event "event" "state-transition" "${fault_reason}" "${recovery_state}" "" "" LAST_REPORTED_FAULT_REASON="${fault_reason}" LAST_REPORTED_RECOVERY_STATE="${recovery_state}" } watchdog_launch_incident() { local reason="$1" local unit_name="$2" "${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" \ --source watchdog \ --reason "${reason}" \ --unit "${unit_name}" \ --result failure \ --exit-status 1 2>/dev/null || true } set_last_action() { LAST_ACTION="$1" LAST_ACTION_EPOCH_MS="$(now_epoch_ms)" } targeted_restart_total() { local total=0 local key for key in "${!TARGETED_RESTART_WINDOW_COUNT[@]}"; do total=$(( total + TARGETED_RESTART_WINDOW_COUNT["${key}"] )) done printf '%s\n' "${total}" } register_targeted_restart() { local 
fault_key="$1" local now_sec local window_start local count now_sec="$(now_epoch_sec)" window_start="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}" count="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}" if (( window_start == 0 || now_sec - window_start > 60 )); then window_start="${now_sec}" count=1 else count=$(( count + 1 )) fi TARGETED_RESTART_WINDOW_START["${fault_key}"]="${window_start}" TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${count}" (( count >= 2 )) } record_full_restart() { local now_sec now_sec="$(now_epoch_sec)" if (( FULL_RESTART_WINDOW_START == 0 || now_sec - FULL_RESTART_WINDOW_START > 600 )); then FULL_RESTART_WINDOW_START="${now_sec}" FULL_RESTART_WINDOW_COUNT=1 else FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 )) fi if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then BACKOFF_UNTIL=$(( now_sec + 60 )) watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" "" fi } restart_bside_targeted() { local fault_key="$1" local reason="$2" local rc local incident_id="" if register_targeted_restart "${fault_key}"; then blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0 watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" "" full_restart_stack "${reason}-escalated" return 0 fi incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")" set_last_action "restart-bside" RECOVERY_ACTION_TAKEN=1 blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0 watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}" if systemctl restart "${B_SIDE_SERVICE}"; then blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0 watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}" return 0 fi rc=$? 
blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}" watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}" return "${rc}" } full_restart_stack() { local reason="$1" local rc local incident_id="" incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")" set_last_action "full-restart" RECOVERY_ACTION_TAKEN=1 recovery_state="recovering" fault_reason="${reason}" blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0 watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}" systemctl stop "${B_SIDE_SERVICE}" || true if systemctl restart "${ROS_SERVICE}"; then blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0 else rc=$? blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}" record_full_restart return "${rc}" fi if bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then : else rc=$? blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}" record_full_restart return "${rc}" fi if systemctl start "${B_SIDE_SERVICE}"; then blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0 else rc=$? 
blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}" watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}" record_full_restart return "${rc}" fi watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}" record_full_restart } network_fault_injected() { [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" == "1" && -f "${NETWORK_FAULT_FILE}" ]] } resolve_network_interface() { NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)" if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}" return 0 fi return 1 } network_route_targets() { local target if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then printf '%s\n' "${BLITZ_TIME_SERVER_IP}" fi for target in ${BLITZ_5G_ROUTE_TARGETS//,/ }; do if [[ -n "${target}" && "${target}" != "${BLITZ_TIME_SERVER_IP:-}" ]]; then printf '%s\n' "${target}" fi done } log_target_route_paths() { local action="$1" local target local route_output while IFS= read -r target; do [[ -n "${target}" ]] || continue route_output="$(ip route get "${target}" 2>&1 | head -n 1 || true)" if [[ -z "${route_output}" ]]; then route_output="unresolved" fi blitz_log "${STEP}" "route-path" "info" "action=${action} target=${target} route=${route_output}" 0 done < <(network_route_targets) } route_output_uses_interface() { local route_output="$1" local interface_name="$2" [[ -n "${interface_name}" ]] || return 1 [[ "${route_output}" == *" dev ${interface_name} "* || "${route_output}" == *" dev ${interface_name}" ]] } route_output_uses_gateway() { local route_output="$1" local gateway="$2" [[ -n "${gateway}" ]] || return 1 [[ "${route_output}" == *"via ${gateway}"* ]] } route_is_desired_target_route() { local route_output="$1" local interface_name="$2" local gateway="$3" route_output_uses_interface "${route_output}" "${interface_name}" \ && route_output_uses_gateway 
"${route_output}" "${gateway}" } route_is_managed_5g_route() { local route_output="$1" local interface_name="${2:-}" local gateway="${3:-}" if route_output_uses_interface "${route_output}" "${interface_name}"; then return 0 fi if route_output_uses_gateway "${route_output}" "${gateway}"; then return 0 fi if route_output_uses_gateway "${route_output}" "${BLITZ_5G_GATEWAY:-}"; then return 0 fi return 1 } resolve_route_cleanup_interface() { local interface_name="" local info_json="${BLITZ_5G_INFO_JSON:-}" if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then printf '%s\n' "${NETWORK_LAST_INTERFACE}" return 0 fi if [[ -n "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" ]]; then printf '%s\n' "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" return 0 fi interface_name="$(blitz_read_5g_info_interface "${info_json}" || true)" if [[ -n "${interface_name}" ]]; then printf '%s\n' "${interface_name}" return 0 fi return 1 } resolve_network_gateway() { local interface_name="$1" local default_route local gateway="" local tokens=() local index default_route="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)" if [[ -n "${default_route}" ]]; then read -r -a tokens <<< "${default_route}" for (( index=0; index<${#tokens[@]}-1; index++ )); do if [[ "${tokens[index]}" == "via" ]]; then gateway="${tokens[index + 1]}" break fi done fi if [[ -n "${gateway}" ]]; then printf '%s\n' "${gateway}" return 0 fi if [[ -n "${BLITZ_5G_GATEWAY:-}" ]]; then printf '%s\n' "${BLITZ_5G_GATEWAY}" return 0 fi return 1 } sync_target_routes_to_5g() { local interface_name="$1" local gateway="${2:-}" local route_output="" local updated=0 local target local rc if [[ -z "${interface_name}" ]]; then return 1 fi if [[ -z "${gateway}" ]]; then gateway="$(resolve_network_gateway "${interface_name}" || true)" fi if [[ -z "${gateway}" ]]; then blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1 return 1 fi while IFS= read -r target; do [[ -n "${target}" ]] || continue 
route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)" if [[ -n "${route_output}" ]] && route_is_desired_target_route "${route_output}" "${interface_name}" "${gateway}"; then continue fi if ip route replace "${target}/32" via "${gateway}" dev "${interface_name}"; then updated=1 blitz_log "${STEP}" "route-sync-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0 else rc=$? blitz_log "${STEP}" "route-sync-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}" return "${rc}" fi done < <(network_route_targets) if (( updated == 1 )); then NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}" log_target_route_paths "sync-to-5g" fi return 0 } clear_target_routes_from_5g() { local interface_name="${1:-}" local gateway="${2:-}" local route_output="" local target local removed_any=0 local rc if [[ -z "${interface_name}" ]]; then interface_name="$(resolve_route_cleanup_interface || true)" fi if [[ -z "${gateway}" && -n "${interface_name}" ]]; then gateway="$(resolve_network_gateway "${interface_name}" || true)" fi if [[ -z "${gateway}" ]]; then gateway="${BLITZ_5G_GATEWAY:-}" fi while IFS= read -r target; do [[ -n "${target}" ]] || continue route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)" if [[ -z "${route_output}" ]] || ! route_is_managed_5g_route "${route_output}" "${interface_name}" "${gateway}"; then continue fi if ip route del "${target}/32"; then removed_any=1 blitz_log "${STEP}" "route-clear-target" "success" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0 else rc=$? 
blitz_log "${STEP}" "route-clear-target" "failure" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" "${rc}" return "${rc}" fi done < <(network_route_targets) if (( removed_any == 1 )); then blitz_log "${STEP}" "route-clear" "success" "interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0 log_target_route_paths "clear-from-5g" fi return 0 } repair_network_routes() { local interface_name="$1" local gateway="" local route_output if [[ -z "${interface_name}" ]]; then return 1 fi gateway="$(resolve_network_gateway "${interface_name}" || true)" if [[ -z "${gateway}" ]]; then blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1 return 1 fi if ! sync_target_routes_to_5g "${interface_name}" "${gateway}"; then clear_target_routes_from_5g "${interface_name}" "${gateway}" || true return 1 fi route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)" if [[ -z "${route_output}" ]]; then clear_target_routes_from_5g "${interface_name}" "${gateway}" || true blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1 return 1 fi if ! ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then clear_target_routes_from_5g "${interface_name}" "${gateway}" || true blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1 return 1 fi blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${route_output}" 0 return 0 } network_is_healthy() { local route_output NETWORK_LAST_INTERFACE="" if network_fault_injected; then return 1 fi if ! 
resolve_network_interface; then return 1 fi route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)" if [[ -z "${route_output}" ]]; then return 1 fi ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 } fallback_network_is_healthy() { local route_output if [[ -z "${BLITZ_TIME_SERVER_IP:-}" ]]; then return 1 fi route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" || true)" if [[ -z "${route_output}" ]]; then return 1 fi ping -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 } wait_for_network_recovery() { local timeout_sec="$1" local waited=0 while (( waited < timeout_sec )); do if network_is_healthy; then blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${waited}" 0 return 0 fi if (( waited == 0 || waited % 5 == 0 )); then blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${waited}" 0 fi sleep 1 waited=$(( waited + 1 )) done blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1 return 1 } perform_network_recovery() { local rc=0 local incident_id="" if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then set_last_action "route-repair" RECOVERY_ACTION_TAKEN=1 NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC )) NETWORK_FAIL_COUNT=0 blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0 watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" "" return 0 fi incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")" set_last_action "network-recovery" RECOVERY_ACTION_TAKEN=1 blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0 watchdog_append_event "event" 
"network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}" systemctl stop "${B_SIDE_SERVICE}" || true if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then : else rc=$? blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}" watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}" return "${rc}" fi if wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}"; then : else rc=$? blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}" watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}" return "${rc}" fi NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC )) NETWORK_FAIL_COUNT=0 watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}" if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then restart_bside_targeted "network" "network-recovered" return 0 fi full_restart_stack "network-recovered-ros-unhealthy" return 0 } blitz_load_boot_env blitz_require_root "${STEP}" blitz_require_command systemctl "${STEP}" blitz_require_command stat "${STEP}" blitz_require_command ping "${STEP}" blitz_require_command python3 "${STEP}" blitz_prepare_runtime_dir blitz_require_run_context B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json" WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json" NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down" WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl" WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl" 
# Publish the outcome of one watchdog cycle, then sleep until the next one.
# Arguments: $1 network_ok, $2 camera_ok, $3 ros_ok, $4 bside_ok,
#            $5 gps_ok, $6 gps_device_present
# Globals:   fault_reason, recovery_state, BLITZ_WATCHDOG_INTERVAL_SEC (read)
watchdog_publish_cycle() {
  watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
  write_watchdog_status "${fault_reason}" "${recovery_state}" "$1" "$2" "$3" "$4" "$5" "$6"
  watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "$1" "$2" "$3" "$4" "$5" "$6"
  sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
}

# Main watchdog loop. Each cycle checks, in order: backoff, network, GPS,
# camera, B-side service and ROS receiver. At most one recovery action is
# started per cycle (tracked by RECOVERY_ACTION_TAKEN); later checks can
# overwrite fault_reason / recovery_state set by earlier ones.
while true; do
  # Per-cycle defaults: everything healthy until a check says otherwise.
  fault_reason="none"
  recovery_state="ok"
  network_ok=1
  camera_ok=1
  ros_ok=1
  bside_ok=1
  gps_ok=1
  gps_device_present=1
  RECOVERY_ACTION_TAKEN=0
  now_sec="$(now_epoch_sec)"

  # Seed the GPS flags from the state recorded by the previous GPS check.
  if gps_monitor_enabled; then
    gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
    if ! (( GPS_DEVICE_PRESENT_STATE != 0 && GPS_STACK_ACTIVE_STATE != 0 )); then
      gps_ok=0
    fi
  fi

  # While backing off, only report and wait; no checks or recovery run.
  if (( BACKOFF_UNTIL > now_sec )); then
    fault_reason="backoff"
    recovery_state="backoff"
    watchdog_publish_cycle 0 0 0 0 "${gps_ok}" "${gps_device_present}"
    continue
  fi

  # --- Network -------------------------------------------------------------
  if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
    recovery_state="recovering"
  elif network_is_healthy; then
    NETWORK_PRIMARY_LAST_RETRY_SEC=0
    NETWORK_FAIL_COUNT=0
    sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true
  else
    clear_target_routes_from_5g || true
    if fallback_network_is_healthy; then
      NETWORK_FAIL_COUNT=0
      fault_reason="network_fallback_active"
      recovery_state="degraded"
      blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0
      # Retry the primary path at most once every 10 seconds.
      if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then
        NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}"
        if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
          NETWORK_PRIMARY_LAST_RETRY_SEC=0
          fault_reason="none"
          recovery_state="ok"
          blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0
          log_target_route_paths "primary-restored"
        fi
      fi
    else
      network_ok=0
      NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
      fault_reason="network_or_robot_unreachable"
      recovery_state="recovering"
      blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
      if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
        perform_network_recovery || true
      fi
    fi
  fi

  # --- GPS -----------------------------------------------------------------
  if ! check_gps_health "${now_sec}"; then
    gps_ok=0
    gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
    # A GPS fault is reported only when nothing else is already wrong.
    if [[ "${fault_reason}" == "none" ]]; then
      if (( GPS_DEVICE_PRESENT_STATE != 0 )); then
        fault_reason="gps_reconnect_failed"
      else
        fault_reason="gps_device_missing"
      fi
      recovery_state="degraded"
    fi
  else
    gps_ok=1
  fi
  gps_device_present="${GPS_DEVICE_PRESENT_STATE}"

  # --- Camera --------------------------------------------------------------
  if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
    camera_ok=0
    fault_reason="camera_missing"
    recovery_state="degraded"
    CAMERA_MISSING_PREV=1
    CAMERA_RECOVERY_STABLE_COUNT=0
  elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
    # The device is back: require two consecutive sightings before
    # restarting the B-side service.
    CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
    recovery_state="recovering"
    fault_reason="camera_recovered"
    if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
      restart_bside_targeted "camera" "camera-reappeared" || true
      CAMERA_MISSING_PREV=0
      CAMERA_RECOVERY_STABLE_COUNT=0
    fi
  else
    CAMERA_RECOVERY_STABLE_COUNT=0
  fi

  # --- B-side service ------------------------------------------------------
  if (( RECOVERY_ACTION_TAKEN == 0 )) \
    && ! { service_is_active "${B_SIDE_SERVICE}" && status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
    bside_ok=0
    fault_reason="bside_status_stale"
    recovery_state="recovering"
    restart_bside_targeted "bside" "bside-unhealthy" || true
  fi

  # --- ROS receiver --------------------------------------------------------
  if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    ros_ok=0
    fault_reason="ros_receiver_unhealthy"
    recovery_state="recovering"
    full_restart_stack "ros-unhealthy" || true
  fi

  watchdog_publish_cycle "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
done