830 lines
23 KiB
Bash
830 lines
23 KiB
Bash
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
# shellcheck disable=SC1091
|
|
source "${SCRIPT_DIR}/common.sh"
|
|
|
|
# ---------------------------------------------------------------------------
# Watchdog state shared by the functions below and the main loop.
# ---------------------------------------------------------------------------

# Step label passed as the first argument of every blitz_log call in this file.
STEP='watchdog'

# systemd units supervised by the watchdog.
B_SIDE_SERVICE='blitz-b-side-omnid.service'
ROS_SERVICE='blitz-ros-receiver.service'

# Runtime file paths. They start empty and are filled in once the runtime
# directory is known (see the assignments just before the main loop).
B_SIDE_STATUS_FILE=''
ROS_STATUS_FILE=''
WATCHDOG_STATUS_FILE=''
NETWORK_FAULT_FILE=''

# Camera tracking: 1 once the camera device was seen missing, plus the number
# of consecutive loop passes in which it has been present again since.
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0

# Network failure counter and the epoch-second deadlines that pause network
# checks (cooldown) or the whole loop (backoff).
NETWORK_FAIL_COUNT=0
NETWORK_COOLDOWN_UNTIL=0
BACKOFF_UNTIL=0

# Most recent recovery action and when it was taken (epoch milliseconds).
LAST_ACTION='none'
LAST_ACTION_EPOCH_MS=0

# Sliding window used by record_full_restart.
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0

# Interface bookkeeping for route management and primary-link retries.
NETWORK_LAST_INTERFACE=''
NETWORK_ROUTE_INTERFACE_LAST_KNOWN=''
NETWORK_PRIMARY_LAST_RETRY_SEC=0

# GPS monitoring state. PREV starts at -1, i.e. "no check performed yet".
GPS_LAST_CHECK_SEC=0
GPS_DEVICE_PRESENT_PREV=-1
GPS_DEVICE_PRESENT_STATE=1
GPS_STACK_ACTIVE_STATE=1

# Per-fault-key sliding windows used by register_targeted_restart.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
|
|
|
# Print the current Unix time in whole seconds on stdout.
now_epoch_sec() {
  date '+%s'
}
|
|
|
|
# Print the current Unix time in milliseconds on stdout.
# Relies on `date` supporting the %N (nanoseconds) conversion; %3N keeps the
# first three digits, i.e. milliseconds.
now_epoch_ms() {
  date '+%s%3N'
}
|
|
|
|
# Succeed when systemd reports the given unit as active.
# Arguments: $1 - unit name.
# Returns:   the exit status of `systemctl is-active --quiet`.
service_is_active() {
  local unit_name="$1"

  systemctl is-active --quiet "${unit_name}"
}
|
|
|
|
# Succeed only when BLITZ_GPS_MONITOR_ENABLED is exactly "1".
# An unset or empty variable counts as disabled.
gps_monitor_enabled() {
  case "${BLITZ_GPS_MONITOR_ENABLED:-0}" in
    1) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# Succeed when at least one unit listed in BLITZ_GPS_RESTART_UNITS
# (whitespace separated) is active. Fails when the list is empty or when no
# listed unit is active.
gps_stack_active() {
  local -a gps_units=()
  local candidate

  read -r -a gps_units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  (( ${#gps_units[@]} > 0 )) || return 1

  for candidate in "${gps_units[@]}"; do
    # The first active unit is enough.
    service_is_active "${candidate}" && return 0
  done
  return 1
}
|
|
|
|
# Restart every unit listed in BLITZ_GPS_RESTART_UNITS with one systemctl call.
# Globals:   BLITZ_GPS_RESTART_UNITS (read), STEP (read),
#            GPS_STACK_ACTIVE_STATE (written: 1 on success, 0 on failure)
# Arguments: $1 - reason recorded in the log lines
#            $2 - device summary recorded in the log lines
# Returns:   0 on success; 1 when no units are configured; otherwise the
#            exit status of `systemctl restart`.
restart_gps_stack() {
  local reason="$1"
  local devices="$2"
  local units=()
  local rc=0

  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    GPS_STACK_ACTIVE_STATE=0
    blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1
    return 1
  fi

  set_last_action "gps-reconnect"
  blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0

  # Capture the status at the point of failure. Reading $? after an
  # `if ...; fi` whose condition failed always yields 0, which would turn a
  # failed restart into a reported success.
  systemctl restart "${units[@]}" || rc=$?
  if (( rc == 0 )); then
    GPS_STACK_ACTIVE_STATE=1
    blitz_log "${STEP}" "gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
    return 0
  fi

  GPS_STACK_ACTIVE_STATE=0
  blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}"
  return "${rc}"
}
|
|
|
|
# Throttled GPS health probe.
# Globals:   BLITZ_GPS_CHECK_INTERVAL_SEC (read, default 10, clamped to >= 1),
#            BLITZ_GPS_DEVICE_GLOB (read), STEP (read),
#            GPS_LAST_CHECK_SEC, GPS_DEVICE_PRESENT_PREV,
#            GPS_DEVICE_PRESENT_STATE, GPS_STACK_ACTIVE_STATE (read/written)
# Arguments: $1 - current time in epoch seconds
# Returns:   0 when GPS is considered healthy (or monitoring is disabled),
#            1 otherwise.
check_gps_health() {
  local now_sec="$1"
  local interval_sec="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
  local glob_pattern="${BLITZ_GPS_DEVICE_GLOB:-}"
  local was_present="${GPS_DEVICE_PRESENT_PREV}"
  local restart_reason=""
  local summary=""
  local -a found_devices=()

  # Monitoring disabled: report healthy and reset the cached state.
  if ! gps_monitor_enabled; then
    GPS_DEVICE_PRESENT_STATE=1
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  if (( interval_sec < 1 )); then
    interval_sec=1
  fi

  # Inside the throttle interval: answer from the cached state only.
  if (( GPS_LAST_CHECK_SEC != 0 && now_sec - GPS_LAST_CHECK_SEC < interval_sec )); then
    (( GPS_DEVICE_PRESENT_STATE == 1 && GPS_STACK_ACTIVE_STATE == 1 )) && return 0
    return 1
  fi
  GPS_LAST_CHECK_SEC="${now_sec}"

  mapfile -t found_devices < <(compgen -G "${glob_pattern}" || true)
  if (( ${#found_devices[@]} == 0 )); then
    GPS_DEVICE_PRESENT_STATE=0
    GPS_STACK_ACTIVE_STATE=0
    # Log only on the transition to "missing", not on every pass.
    if (( was_present != 0 )); then
      blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${glob_pattern}" 1
    fi
    GPS_DEVICE_PRESENT_PREV=0
    return 1
  fi

  # Comma-join the matched device paths for the log lines.
  printf -v summary '%s,' "${found_devices[@]}"
  summary="${summary%,}"
  GPS_DEVICE_PRESENT_STATE=1
  GPS_DEVICE_PRESENT_PREV=1

  if (( was_present == 0 )); then
    blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${summary}" 0
    restart_reason="device-reappeared"
  elif ! gps_stack_active; then
    restart_reason="gpsd-inactive"
  fi

  if [[ -z "${restart_reason}" ]]; then
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  restart_gps_stack "${restart_reason}" "${summary}" && return 0
  return 1
}
|
|
|
|
# Succeed when a regular file exists at the path and its modification time is
# at most the given number of seconds old.
# Arguments: $1 - file path
#            $2 - maximum age in seconds
status_file_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local current_sec
  local modified_sec
  local age_sec

  [[ -f "${path}" ]] || return 1

  current_sec="$(now_epoch_sec)"
  # If stat fails the mtime falls back to 0, which makes the file look very old.
  modified_sec="$(stat -c %Y "${path}" 2>/dev/null || echo 0)"
  age_sec=$(( current_sec - modified_sec ))
  (( age_sec <= max_age_sec ))
}
|
|
|
|
# Inspect the ROS receiver status JSON with an embedded Python script.
# Succeeds only when the file parses as JSON, "socket_bound" is truthy and
# "recv_thread_heartbeat_epoch_ms" is a positive value no older than the
# allowed age.
# Arguments: $1 - path of the status JSON file
#            $2 - maximum heartbeat age in seconds
ros_receiver_status_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local reference_ms

  reference_ms="$(now_epoch_ms)"
  python3 - "${path}" "${reference_ms}" "${max_age_sec}" <<'PY'
import json
import sys

status_path = sys.argv[1]
now_epoch_ms = int(sys.argv[2])
max_age_ms = int(sys.argv[3]) * 1000

# An unreadable or malformed status file counts as "not fresh".
try:
    with open(status_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
except Exception:
    sys.exit(1)

heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0)
socket_bound = bool(payload.get("socket_bound"))

if not socket_bound or heartbeat_ms <= 0:
    sys.exit(1)

age_ms = now_epoch_ms - heartbeat_ms
sys.exit(0 if age_ms <= max_age_ms else 1)
PY
}
|
|
|
|
# Composite health check for the ROS receiver. All of the following must hold,
# checked in this order (the first failure decides the exit status):
#   1. ROS_SERVICE is active,
#   2. ROBOT_RECEIVER_LOCAL_SOCKET_PATH is a socket,
#   3. ROS_STATUS_FILE was modified recently enough,
#   4. the heartbeat recorded inside ROS_STATUS_FILE is recent enough.
# Arguments: $1 - maximum age in seconds used for checks 3 and 4
ros_receiver_healthy() {
  local max_age_sec="$1"

  service_is_active "${ROS_SERVICE}" || return
  [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] || return
  status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" || return
  ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}"
}
|
|
|
|
# Write the watchdog status JSON document.
# The document is written to a per-process temporary file next to the target
# and then moved over WATCHDOG_STATUS_FILE.
# Globals:   WATCHDOG_STATUS_FILE, NETWORK_FAIL_COUNT,
#            FULL_RESTART_WINDOW_COUNT, LAST_ACTION, LAST_ACTION_EPOCH_MS (read)
# Arguments: $1 fault reason      $2 recovery state
#            $3 network_ok        $4 camera_ok
#            $5 ros_ok            $6 bside_ok
#            $7 gps_ok            $8 gps_device_present
#            ($3..$8 are emitted unquoted, i.e. as JSON numbers/literals)
write_watchdog_status() {
  local fault_reason="$1"
  local recovery_state="$2"
  local network_ok="$3"
  local camera_ok="$4"
  local ros_ok="$5"
  local bside_ok="$6"
  local gps_ok="$7"
  local gps_device_present="$8"
  local staging_file="${WATCHDOG_STATUS_FILE}.tmp.$$"

  {
    printf '%s\n' '{'
    printf '"updated_at_epoch_ms": %s,\n' "$(now_epoch_ms)"
    printf '"fault_reason": "%s",\n' "${fault_reason}"
    printf '"recovery_state": "%s",\n' "${recovery_state}"
    printf '"network_ok": %s,\n' "${network_ok}"
    printf '"camera_ok": %s,\n' "${camera_ok}"
    printf '"ros_ok": %s,\n' "${ros_ok}"
    printf '"bside_ok": %s,\n' "${bside_ok}"
    printf '"gps_ok": %s,\n' "${gps_ok}"
    printf '"gps_device_present": %s,\n' "${gps_device_present}"
    printf '"network_fail_count": %s,\n' "${NETWORK_FAIL_COUNT}"
    printf '"targeted_restart_count": %s,\n' "$(targeted_restart_total)"
    printf '"full_restart_count": %s,\n' "${FULL_RESTART_WINDOW_COUNT}"
    printf '"last_action": "%s",\n' "${LAST_ACTION}"
    printf '"last_action_epoch_ms": %s\n' "${LAST_ACTION_EPOCH_MS}"
    printf '%s\n' '}'
  } > "${staging_file}"

  mv -f "${staging_file}" "${WATCHDOG_STATUS_FILE}"
}
|
|
|
|
# Record the most recent recovery action and the time it was taken.
# Globals:   LAST_ACTION, LAST_ACTION_EPOCH_MS (written)
# Arguments: $1 - action name
set_last_action() {
  local action_name="$1"

  LAST_ACTION="${action_name}"
  LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}
|
|
|
|
# Print the sum of all per-fault counters stored in
# TARGETED_RESTART_WINDOW_COUNT, followed by a newline.
targeted_restart_total() {
  local sum=0
  local fault_key
  local window_count

  for fault_key in "${!TARGETED_RESTART_WINDOW_COUNT[@]}"; do
    window_count="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]}"
    sum=$(( sum + window_count ))
  done
  printf '%s\n' "${sum}"
}
|
|
|
|
# Count a targeted restart for the given fault key inside a 60-second window.
# A new window (count 1) starts when the key has no window yet or the current
# one is more than 60 seconds old; otherwise the count is incremented.
# Globals:   TARGETED_RESTART_WINDOW_START, TARGETED_RESTART_WINDOW_COUNT
#            (read/written)
# Arguments: $1 - fault key
# Returns:   0 when the count within the window has reached 2 or more
#            (callers treat this as "escalate"), 1 otherwise.
register_targeted_restart() {
  local fault_key="$1"
  local current_sec
  local started_at
  local restarts

  current_sec="$(now_epoch_sec)"
  started_at="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}"
  restarts="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}"

  if (( started_at != 0 && current_sec - started_at <= 60 )); then
    restarts=$(( restarts + 1 ))
  else
    started_at="${current_sec}"
    restarts=1
  fi

  TARGETED_RESTART_WINDOW_START["${fault_key}"]="${started_at}"
  TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${restarts}"
  (( restarts >= 2 ))
}
|
|
|
|
# Count a full-stack restart inside a 600-second window and arm a 60-second
# backoff once the window holds three or more restarts.
# Globals:   FULL_RESTART_WINDOW_START, FULL_RESTART_WINDOW_COUNT
#            (read/written), BACKOFF_UNTIL (written)
record_full_restart() {
  local current_sec

  current_sec="$(now_epoch_sec)"
  if (( FULL_RESTART_WINDOW_START != 0 && current_sec - FULL_RESTART_WINDOW_START <= 600 )); then
    FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
  else
    # No window yet, or the previous one expired: start a fresh one.
    FULL_RESTART_WINDOW_START="${current_sec}"
    FULL_RESTART_WINDOW_COUNT=1
  fi

  if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
    BACKOFF_UNTIL=$(( current_sec + 60 ))
  fi
}
|
|
|
|
# Restart only the B-side service, escalating to a full stack restart when
# register_targeted_restart reports that this fault key has restarted
# repeatedly within its window.
# Globals:   STEP, B_SIDE_SERVICE (read), RECOVERY_ACTION_TAKEN (written)
# Arguments: $1 - fault key used for the restart window
#            $2 - reason recorded in the log lines
# Returns:   0 on a successful restart or after an escalation (the escalated
#            restart logs its own outcome); otherwise the exit status of
#            `systemctl restart`.
restart_bside_targeted() {
  local fault_key="$1"
  local reason="$2"
  local rc=0

  if register_targeted_restart "${fault_key}"; then
    blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
    # The escalation result is intentionally not propagated; make that explicit
    # so it also holds when errexit is in effect for the caller.
    full_restart_stack "${reason}-escalated" || true
    return 0
  fi

  set_last_action "restart-bside"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0

  # Capture the status at the point of failure. Reading $? after an
  # `if ...; fi` whose condition failed always yields 0, which would hide the
  # failure from both the log and the caller.
  systemctl restart "${B_SIDE_SERVICE}" || rc=$?
  if (( rc == 0 )); then
    blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
    return 0
  fi

  blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
  return "${rc}"
}
|
|
|
|
# Restart the whole stack in order: stop the B-side service, restart the ROS
# service, wait for its unix socket via wait-for-unix-socket.sh, then start
# the B-side service again. Every outcome, success or failure, is counted
# with record_full_restart.
# Globals:   STEP, B_SIDE_SERVICE, ROS_SERVICE, BOOT_SCRIPT_DIR,
#            BLITZ_ROS_SOCKET_WAIT_SEC (read);
#            RECOVERY_ACTION_TAKEN, recovery_state, fault_reason (written;
#            the latter two are the main loop's status variables)
# Arguments: $1 - reason recorded in the log lines and in fault_reason
# Returns:   0 on success, otherwise the exit status of the step that failed.
full_restart_stack() {
  local reason="$1"
  local rc=0

  set_last_action "full-restart"
  RECOVERY_ACTION_TAKEN=1
  recovery_state="recovering"
  fault_reason="${reason}"

  blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
  # Stopping is best effort; the service may already be down.
  systemctl stop "${B_SIDE_SERVICE}" || true

  systemctl restart "${ROS_SERVICE}" || rc=$?
  if (( rc != 0 )); then
    blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi
  blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0

  bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}" || rc=$?
  if (( rc != 0 )); then
    blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi

  systemctl start "${B_SIDE_SERVICE}" || rc=$?
  if (( rc != 0 )); then
    blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi
  blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0

  record_full_restart
}
|
|
|
|
# Succeed when a simulated network fault is requested: fault injection must be
# enabled (BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION == "1") and the marker file
# NETWORK_FAULT_FILE must exist as a regular file.
# An unset flag is treated as "disabled" rather than aborting under `set -u`,
# matching how the other optional BLITZ_* switches in this file are read.
network_fault_injected() {
  [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}" == "1" && -f "${NETWORK_FAULT_FILE:-}" ]]
}
|
|
|
|
# Ask blitz_resolve_5g_interface for the current interface name and cache it.
# Globals:   NETWORK_LAST_INTERFACE (always overwritten, possibly with ""),
#            NETWORK_ROUTE_INTERFACE_LAST_KNOWN (updated only on success)
# Returns:   0 when a non-empty interface name was obtained, 1 otherwise.
resolve_network_interface() {
  NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)"
  [[ -n "${NETWORK_LAST_INTERFACE}" ]] || return 1

  NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}"
  return 0
}
|
|
|
|
# Print, one per line, the addresses whose routes the watchdog manages:
# BLITZ_TIME_SERVER_IP first (when set), followed by every entry of
# BLITZ_5G_ROUTE_TARGETS (comma and/or whitespace separated) that differs
# from the time server.
# Both variables are optional. The list is split with `read`, so entries are
# never subject to pathname expansion and an unset BLITZ_5G_ROUTE_TARGETS does
# not abort the script under `set -u`.
# Returns:   always 0.
network_route_targets() {
  local time_server="${BLITZ_TIME_SERVER_IP:-}"
  local -a targets=()
  local target

  if [[ -n "${time_server}" ]]; then
    printf '%s\n' "${time_server}"
  fi

  # -d '' reads the whole value (read then reports EOF, hence `|| true`);
  # IFS splits it on commas, blanks and newlines.
  IFS=$', \t\n' read -r -d '' -a targets <<< "${BLITZ_5G_ROUTE_TARGETS:-}" || true
  (( ${#targets[@]} > 0 )) || return 0

  for target in "${targets[@]}"; do
    if [[ -n "${target}" && "${target}" != "${time_server}" ]]; then
      printf '%s\n' "${target}"
    fi
  done
  return 0
}
|
|
|
|
# Log the first line of `ip route get` for every managed target, or the word
# "unresolved" when the command prints nothing.
# Arguments: $1 - action label included in each log line
log_target_route_paths() {
  local action="$1"
  local destination
  local first_line

  while IFS= read -r destination; do
    if [[ -z "${destination}" ]]; then
      continue
    fi
    # stderr is captured too, so a lookup error message ends up in the log.
    first_line="$(ip route get "${destination}" 2>&1 | head -n 1 || true)"
    blitz_log "${STEP}" "route-path" "info" "action=${action} target=${destination} route=${first_line:-unresolved}" 0
  done < <(network_route_targets)
}
|
|
|
|
# Succeed when the route text contains " dev <interface>" either followed by a
# space or at the very end of the text. An empty interface name never matches.
# Arguments: $1 - route text (as printed by `ip route`)
#            $2 - interface name
route_output_uses_interface() {
  local route_text="$1"
  local iface="$2"

  if [[ -z "${iface}" ]]; then
    return 1
  fi

  case "${route_text}" in
    *" dev ${iface} "* | *" dev ${iface}") return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# Succeed when the route text goes through exactly the given gateway, i.e. it
# contains "via <gateway>" followed by a space or the end of the text.
# The trailing boundary matters: a plain substring test would accept
# "via 10.0.0.10" for gateway 10.0.0.1 and misclassify foreign routes.
# An empty gateway never matches.
# Arguments: $1 - route text (as printed by `ip route`)
#            $2 - gateway address
route_output_uses_gateway() {
  local route_output="$1"
  local gateway="$2"

  [[ -n "${gateway}" ]] || return 1
  [[ "${route_output}" == *"via ${gateway} "* || "${route_output}" == *"via ${gateway}" ]]
}
|
|
|
|
# Succeed when the route text uses BOTH the given interface and the given
# gateway, i.e. it already is the route the watchdog would install.
# Arguments: $1 - route text, $2 - interface name, $3 - gateway address
route_is_desired_target_route() {
  local route_text="$1"
  local iface="$2"
  local next_hop="$3"

  if ! route_output_uses_interface "${route_text}" "${iface}"; then
    return 1
  fi
  route_output_uses_gateway "${route_text}" "${next_hop}"
}
|
|
|
|
# Succeed when the route text looks like one the watchdog manages: it uses the
# given interface, OR the given gateway, OR the configured BLITZ_5G_GATEWAY.
# Arguments: $1 - route text
#            $2 - interface name (optional)
#            $3 - gateway address (optional)
route_is_managed_5g_route() {
  local route_text="$1"
  local iface="${2:-}"
  local next_hop="${3:-}"

  route_output_uses_interface "${route_text}" "${iface}" \
    || route_output_uses_gateway "${route_text}" "${next_hop}" \
    || route_output_uses_gateway "${route_text}" "${BLITZ_5G_GATEWAY:-}"
}
|
|
|
|
# Print the interface name to use when cleaning up routes. Candidates are
# tried in this order and the first non-empty one wins:
#   1. NETWORK_LAST_INTERFACE
#   2. NETWORK_ROUTE_INTERFACE_LAST_KNOWN
#   3. the output of blitz_read_5g_info_interface for BLITZ_5G_INFO_JSON
# Returns:   0 when a name was printed, 1 when every candidate was empty.
resolve_route_cleanup_interface() {
  local candidate

  for candidate in "${NETWORK_LAST_INTERFACE}" "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}"; do
    if [[ -n "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done

  # Only consult the info file when neither cached name is available.
  candidate="$(blitz_read_5g_info_interface "${BLITZ_5G_INFO_JSON:-}" || true)"
  [[ -n "${candidate}" ]] || return 1

  printf '%s\n' "${candidate}"
  return 0
}
|
|
|
|
# Print the gateway to use for the given interface.
# The first default route reported by `ip -o route show default dev <iface>`
# is scanned for a "via" token and the token that follows it is used. When no
# such token exists the configured BLITZ_5G_GATEWAY is used instead.
# Arguments: $1 - interface name
# Returns:   0 when a gateway was printed, 1 when none could be determined.
resolve_network_gateway() {
  local interface_name="$1"
  local first_default
  local gateway=""
  local -a fields=()
  local field
  local previous=""

  first_default="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)"
  if [[ -n "${first_default}" ]]; then
    read -r -a fields <<< "${first_default}"
  fi

  # The +-expansion keeps an empty array safe under `set -u` on older bash.
  for field in ${fields[@]+"${fields[@]}"}; do
    if [[ "${previous}" == "via" ]]; then
      gateway="${field}"
      break
    fi
    previous="${field}"
  done

  gateway="${gateway:-${BLITZ_5G_GATEWAY:-}}"
  [[ -n "${gateway}" ]] || return 1

  printf '%s\n' "${gateway}"
  return 0
}
|
|
|
|
# Make sure every managed target has a /32 route via the given gateway on the
# given interface. Targets whose current route already matches are skipped;
# the others are installed with `ip route replace`.
# Globals:   STEP (read), NETWORK_ROUTE_INTERFACE_LAST_KNOWN (written when at
#            least one route was changed)
# Arguments: $1 - interface name (required)
#            $2 - gateway (optional; resolved via resolve_network_gateway
#                 when empty)
# Returns:   0 on success; 1 when the interface is empty or no gateway could
#            be determined; otherwise the exit status of the failing
#            `ip route replace` (processing stops at the first failure).
sync_target_routes_to_5g() {
  local interface_name="$1"
  local gateway="${2:-}"
  local existing_route=""
  local changed=0
  local target
  local rc=0

  [[ -n "${interface_name}" ]] || return 1

  if [[ -z "${gateway}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
    if [[ -z "${gateway}" ]]; then
      blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1
      return 1
    fi
  fi

  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue

    existing_route="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    if [[ -n "${existing_route}" ]] && route_is_desired_target_route "${existing_route}" "${interface_name}" "${gateway}"; then
      continue
    fi

    ip route replace "${target}/32" via "${gateway}" dev "${interface_name}" || rc=$?
    if (( rc != 0 )); then
      blitz_log "${STEP}" "route-sync-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}"
      return "${rc}"
    fi
    changed=1
    blitz_log "${STEP}" "route-sync-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0
  done < <(network_route_targets)

  if (( changed == 1 )); then
    NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}"
    log_target_route_paths "sync-to-5g"
  fi
  return 0
}
|
|
|
|
# Delete the /32 routes of all managed targets that route_is_managed_5g_route
# recognises as ours; routes that do not match are left alone.
# Arguments: $1 - interface name (optional; resolved via
#                 resolve_route_cleanup_interface when empty)
#            $2 - gateway (optional; resolved via resolve_network_gateway
#                 when an interface is known, then BLITZ_5G_GATEWAY)
# Returns:   0 on success (including "nothing to remove"); otherwise the exit
#            status of the failing `ip route del` (processing stops there).
clear_target_routes_from_5g() {
  local interface_name="${1:-}"
  local gateway="${2:-}"
  local existing_route=""
  local target
  local removed=0
  local rc=0

  if [[ -z "${interface_name}" ]]; then
    interface_name="$(resolve_route_cleanup_interface || true)"
  fi
  if [[ -z "${gateway}" && -n "${interface_name}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
  fi
  gateway="${gateway:-${BLITZ_5G_GATEWAY:-}}"

  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue

    existing_route="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    [[ -n "${existing_route}" ]] || continue
    route_is_managed_5g_route "${existing_route}" "${interface_name}" "${gateway}" || continue

    ip route del "${target}/32" || rc=$?
    if (( rc != 0 )); then
      blitz_log "${STEP}" "route-clear-target" "failure" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" "${rc}"
      return "${rc}"
    fi
    removed=1
    blitz_log "${STEP}" "route-clear-target" "success" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
  done < <(network_route_targets)

  if (( removed == 1 )); then
    blitz_log "${STEP}" "route-clear" "success" "interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
    log_target_route_paths "clear-from-5g"
  fi
  return 0
}
|
|
|
|
# Re-install the managed routes on the given interface and verify them:
# resolve the gateway, sync the routes, confirm with blitz_route_ready that
# BLITZ_TIME_SERVER_IP is routable over the interface, and finally ping it
# once through that interface. Any failure after the gateway was resolved
# rolls the managed routes back with clear_target_routes_from_5g.
# Arguments: $1 - interface name
# Returns:   0 when the routes are in place and the probe succeeded, else 1.
repair_network_routes() {
  local interface_name="$1"
  local gateway=""
  local ready_route

  [[ -n "${interface_name}" ]] || return 1

  gateway="$(resolve_network_gateway "${interface_name}" || true)"
  if [[ -z "${gateway}" ]]; then
    blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1
    return 1
  fi

  if ! sync_target_routes_to_5g "${interface_name}" "${gateway}"; then
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    return 1
  fi

  ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)"
  if [[ -n "${ready_route}" ]]; then
    if ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then
      blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${ready_route}" 0
      return 0
    fi
    # Route exists but the target does not answer.
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1
    return 1
  fi

  clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
  blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1
  return 1
}
|
|
|
|
# Check the primary network path: no injected fault, an interface can be
# resolved, blitz_route_ready reports a route to BLITZ_TIME_SERVER_IP over
# that interface, and a single ping through the interface succeeds.
# Globals:   NETWORK_LAST_INTERFACE (reset to "" first, then set by
#            resolve_network_interface)
# Returns:   0 when healthy, non-zero otherwise.
network_is_healthy() {
  local ready_route

  NETWORK_LAST_INTERFACE=""
  if network_fault_injected || ! resolve_network_interface; then
    return 1
  fi

  ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
  [[ -n "${ready_route}" ]] || return 1

  ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
|
|
|
|
# Check whether BLITZ_TIME_SERVER_IP is reachable over whatever route the
# system currently has, without binding to a specific interface:
# blitz_route_ready must report a route and a single ping must succeed.
# Returns:   0 when reachable; non-zero otherwise, including when
#            BLITZ_TIME_SERVER_IP is unset or empty.
fallback_network_is_healthy() {
  local ready_route
  local probe_target="${BLITZ_TIME_SERVER_IP:-}"

  [[ -n "${probe_target}" ]] || return 1

  ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" || true)"
  [[ -n "${ready_route}" ]] || return 1

  ping -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
|
|
|
|
# Poll network_is_healthy until it succeeds or the attempt budget is used up.
# One attempt is made per loop pass, with a one-second sleep after each failed
# attempt; a "waiting" line is logged on every fifth pass (0, 5, 10, ...).
# Arguments: $1 - number of passes to try (named timeout_sec in the logs)
# Returns:   0 as soon as the network is healthy, 1 after the last pass.
wait_for_network_recovery() {
  local timeout_sec="$1"
  local waited

  for (( waited = 0; waited < timeout_sec; waited++ )); do
    if network_is_healthy; then
      blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${waited}" 0
      return 0
    fi
    if (( waited % 5 == 0 )); then
      blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${waited}" 0
    fi
    sleep 1
  done

  blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
  return 1
}
|
|
|
|
# Recover the primary network link.
# Fast path: if an interface resolves and repair_network_routes succeeds, only
# the cooldown is armed and the failure counter reset.
# Slow path: stop the B-side service, run 5g-dial.sh, wait for the network to
# come back, arm the cooldown, then bring services back - a targeted B-side
# restart when the ROS receiver is healthy, a full stack restart otherwise.
# Globals:   STEP, B_SIDE_SERVICE, BOOT_SCRIPT_DIR, BLITZ_5G_ROUTE_WAIT_SEC,
#            BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC, BLITZ_HEALTH_STALE_SEC (read);
#            RECOVERY_ACTION_TAKEN, NETWORK_COOLDOWN_UNTIL,
#            NETWORK_FAIL_COUNT (written)
# Returns:   0 when the network was recovered; otherwise the exit status of
#            the dial script or of wait_for_network_recovery.
perform_network_recovery() {
  local rc=0

  if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
    set_last_action "route-repair"
    RECOVERY_ACTION_TAKEN=1
    NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
    NETWORK_FAIL_COUNT=0
    blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
    return 0
  fi

  set_last_action "network-recovery"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
  # Best effort: the service may already be stopped.
  systemctl stop "${B_SIDE_SERVICE}" || true

  bash "${BOOT_SCRIPT_DIR}/5g-dial.sh" || rc=$?
  if (( rc != 0 )); then
    blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
    return "${rc}"
  fi

  wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}" || rc=$?
  if (( rc != 0 )); then
    blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
    return "${rc}"
  fi

  NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
  NETWORK_FAIL_COUNT=0

  if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    restart_bside_targeted "network" "network-recovered"
  else
    full_restart_stack "network-recovered-ros-unhealthy"
  fi
  return 0
}
|
|
|
|
# ---------------------------------------------------------------------------
# Startup: load the boot environment, verify privileges and required tools,
# prepare the runtime directory, then derive the runtime file paths from it.
# ---------------------------------------------------------------------------
blitz_load_boot_env
blitz_require_root "${STEP}"
for required_command in systemctl stat ping python3; do
  blitz_require_command "${required_command}" "${STEP}"
done
unset required_command
blitz_prepare_runtime_dir

# Status files are looked up inside BLITZ_RUNTIME_DIR; the fault-injection
# marker lives there too.
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
|
|
|
|
# ---------------------------------------------------------------------------
# Main supervision loop. Each pass runs the checks in a fixed order - backoff
# gate, network, GPS, camera, B-side, ROS receiver - publishes the result with
# write_watchdog_status and sleeps BLITZ_WATCHDOG_INTERVAL_SEC.
# RECOVERY_ACTION_TAKEN limits a pass to one service-level recovery: once it
# is set, the camera/B-side/ROS recovery steps are skipped for this pass.
# ---------------------------------------------------------------------------
while true; do
  # Per-pass defaults: everything healthy until a check says otherwise.
  fault_reason="none"
  recovery_state="ok"
  network_ok=1
  camera_ok=1
  ros_ok=1
  bside_ok=1
  gps_ok=1
  gps_device_present=1
  RECOVERY_ACTION_TAKEN=0
  now_sec="$(now_epoch_sec)"

  # Seed the GPS fields from the cached state so the backoff branch below
  # reports the last known GPS status.
  if gps_monitor_enabled; then
    gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
    if (( GPS_DEVICE_PRESENT_STATE == 0 || GPS_STACK_ACTIVE_STATE == 0 )); then
      gps_ok=0
    fi
  fi

  # Backoff: skip every check, publish a "backoff" status and wait.
  if (( BACKOFF_UNTIL > now_sec )); then
    fault_reason="backoff"
    recovery_state="backoff"
    write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
    sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
    continue
  fi

  # ---- Network ------------------------------------------------------------
  if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
    # A recent recovery is still settling; do not probe the network.
    recovery_state="recovering"
  elif network_is_healthy; then
    NETWORK_PRIMARY_LAST_RETRY_SEC=0
    NETWORK_FAIL_COUNT=0
    sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true
  else
    # Primary path is down: drop our managed routes, then see whether the
    # target is still reachable over whatever route remains.
    clear_target_routes_from_5g || true
    if fallback_network_is_healthy; then
      NETWORK_FAIL_COUNT=0
      fault_reason="network_fallback_active"
      recovery_state="degraded"
      blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0

      # Retry the primary path at most once every 10 seconds.
      if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then
        NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}"
        if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
          NETWORK_PRIMARY_LAST_RETRY_SEC=0
          fault_reason="none"
          recovery_state="ok"
          blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0
          log_target_route_paths "primary-restored"
        fi
      fi
    else
      network_ok=0
      NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
      fault_reason="network_or_robot_unreachable"
      recovery_state="recovering"
      blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
      if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
        perform_network_recovery || true
      fi
    fi
  fi

  # ---- GPS ----------------------------------------------------------------
  if check_gps_health "${now_sec}"; then
    gps_ok=1
  else
    gps_ok=0
    # A GPS fault is reported only when nothing else has claimed the pass.
    if [[ "${fault_reason}" == "none" ]]; then
      if (( GPS_DEVICE_PRESENT_STATE == 0 )); then
        fault_reason="gps_device_missing"
      else
        fault_reason="gps_reconnect_failed"
      fi
      recovery_state="degraded"
    fi
  fi
  gps_device_present="${GPS_DEVICE_PRESENT_STATE}"

  # ---- Camera -------------------------------------------------------------
  if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
    camera_ok=0
    fault_reason="camera_missing"
    recovery_state="degraded"
    CAMERA_MISSING_PREV=1
    CAMERA_RECOVERY_STABLE_COUNT=0
  elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
    # The camera is back; restart the B-side only after it has been seen
    # present on two consecutive passes.
    CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
    recovery_state="recovering"
    fault_reason="camera_recovered"
    if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
      restart_bside_targeted "camera" "camera-reappeared" || true
      CAMERA_MISSING_PREV=0
      CAMERA_RECOVERY_STABLE_COUNT=0
    fi
  else
    CAMERA_RECOVERY_STABLE_COUNT=0
  fi

  # ---- B-side service -----------------------------------------------------
  if (( RECOVERY_ACTION_TAKEN == 0 )) && ! { service_is_active "${B_SIDE_SERVICE}" && status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
    bside_ok=0
    fault_reason="bside_status_stale"
    recovery_state="recovering"
    restart_bside_targeted "bside" "bside-unhealthy" || true
  fi

  # ---- ROS receiver -------------------------------------------------------
  if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    ros_ok=0
    fault_reason="ros_receiver_unhealthy"
    recovery_state="recovering"
    full_restart_stack "ros-unhealthy" || true
  fi

  write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
  sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done
|