#!/usr/bin/env bash
#
# Watchdog for the Blitz robot stack. Runs a polling loop that checks the 5G
# network path, the GPS stack, the camera device, the B-side service and the
# ROS receiver, and restarts pieces of the stack when a check fails.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"

# Step label passed as the first argument of every blitz_log call.
STEP="watchdog"

# systemd units supervised by this watchdog.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"

# File paths; assigned further down once the runtime/run directories are known.
B_SIDE_STATUS_FILE=""
ROS_STATUS_FILE=""
WATCHDOG_STATUS_FILE=""
NETWORK_FAULT_FILE=""
WATCHDOG_EVENT_LOG=""
WATCHDOG_SAMPLE_LOG=""

# 1 once a JSONL logging failure has been reported, so it is logged only once
# per failure streak.
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0

# Camera re-appearance tracking.
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0

# Network failure counting and recovery pacing (epoch seconds for *_UNTIL).
NETWORK_FAIL_COUNT=0
NETWORK_COOLDOWN_UNTIL=0
BACKOFF_UNTIL=0

# Most recent recovery action and when it was taken.
LAST_ACTION="none"
LAST_ACTION_EPOCH_MS=0

# Sliding-window bookkeeping for full stack restarts.
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0

# Interface resolved by the latest network check, and the last one known good.
NETWORK_LAST_INTERFACE=""
NETWORK_ROUTE_INTERFACE_LAST_KNOWN=""
NETWORK_PRIMARY_LAST_RETRY_SEC=0

# GPS monitoring state. GPS_DEVICE_PRESENT_PREV starts at -1 ("never checked").
GPS_LAST_CHECK_SEC=0
GPS_DEVICE_PRESENT_PREV=-1
GPS_DEVICE_PRESENT_STATE=1
GPS_STACK_ACTIVE_STATE=1

# Last (fault_reason, recovery_state) pair written to the event log.
LAST_REPORTED_FAULT_REASON=""
LAST_REPORTED_RECOVERY_STATE=""

# Per-fault-key sliding-window bookkeeping for targeted restarts.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()

# Print the current Unix time in whole seconds.
now_epoch_sec() {
  date +%s
}

# Print the current Unix time in milliseconds (seconds followed by the first
# three digits of the nanosecond field, via date's %3N format).
now_epoch_ms() {
  date +%s%3N
}

# Return 0 when systemd reports the unit named by $1 as active.
service_is_active() {
  systemctl is-active --quiet "$1"
}

# Return 0 only when BLITZ_GPS_MONITOR_ENABLED is exactly "1".
# An unset variable counts as disabled.
gps_monitor_enabled() {
  [[ "${BLITZ_GPS_MONITOR_ENABLED:-0}" == "1" ]]
}

# Return 0 when at least one unit listed in BLITZ_GPS_RESTART_UNITS
# (whitespace-separated) is active. Returns 1 when the list is empty or
# none of the units is active.
gps_stack_active() {
  local units=()
  local unit

  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    return 1
  fi

  for unit in "${units[@]}"; do
    if service_is_active "${unit}"; then
      return 0
    fi
  done
  return 1
}

# Restart every unit listed in BLITZ_GPS_RESTART_UNITS (whitespace-separated).
#
# Arguments:
#   $1 - reason for the restart (logged only)
#   $2 - device summary (logged only)
# Globals written: GPS_STACK_ACTIVE_STATE (1 on success, 0 on failure), and
#   whatever set_last_action updates.
# Returns: 0 on success, 1 when no units are configured, otherwise the exit
#   status of `systemctl restart`.
restart_gps_stack() {
  local reason="$1"
  local devices="$2"
  local units=()
  local rc=0

  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    GPS_STACK_ACTIVE_STATE=0
    blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1
    return 1
  fi

  set_last_action "gps-reconnect"
  blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0

  # Capture the status at the call site: reading $? after an `if ...; fi`
  # whose condition failed always yields 0, which would hide the failure.
  systemctl restart "${units[@]}" || rc=$?
  if (( rc == 0 )); then
    GPS_STACK_ACTIVE_STATE=1
    blitz_log "${STEP}" "gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
    return 0
  fi

  GPS_STACK_ACTIVE_STATE=0
  blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}"
  return "${rc}"
}

# Check GPS device presence and, when needed, restart the GPS stack.
#
# Arguments:
#   $1 - current time in epoch seconds
# Environment:
#   BLITZ_GPS_CHECK_INTERVAL_SEC - minimum seconds between real checks
#                                  (default 10, clamped to at least 1)
#   BLITZ_GPS_DEVICE_GLOB        - glob that matches the GPS device node(s)
# Globals written: GPS_DEVICE_PRESENT_STATE, GPS_STACK_ACTIVE_STATE,
#   GPS_DEVICE_PRESENT_PREV, GPS_LAST_CHECK_SEC.
# Returns: 0 when GPS is considered healthy (or monitoring is disabled),
#   1 otherwise.
check_gps_health() {
  local now_sec="$1"
  local check_interval_sec="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
  local device_glob="${BLITZ_GPS_DEVICE_GLOB:-}"
  local previous_present="${GPS_DEVICE_PRESENT_PREV}"
  local recovery_reason=""
  local device_summary=""
  local -a devices=()

  # Monitoring disabled: report healthy and reset the cached state.
  if ! gps_monitor_enabled; then
    GPS_DEVICE_PRESENT_STATE=1
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  if (( check_interval_sec < 1 )); then
    check_interval_sec=1
  fi

  # Inside the throttle interval: answer from the cached state only.
  if (( GPS_LAST_CHECK_SEC != 0 && now_sec - GPS_LAST_CHECK_SEC < check_interval_sec )); then
    if (( GPS_DEVICE_PRESENT_STATE == 1 && GPS_STACK_ACTIVE_STATE == 1 )); then
      return 0
    fi
    return 1
  fi
  GPS_LAST_CHECK_SEC="${now_sec}"

  # compgen -G exits non-zero when nothing matches; `|| true` keeps that from
  # tripping pipefail/errexit inside the process substitution.
  mapfile -t devices < <(compgen -G "${device_glob}" || true)
  if (( ${#devices[@]} == 0 )); then
    GPS_DEVICE_PRESENT_STATE=0
    GPS_STACK_ACTIVE_STATE=0
    # Log only on the transition into "missing", not on every poll.
    if (( previous_present != 0 )); then
      blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${device_glob}" 1
    fi
    GPS_DEVICE_PRESENT_PREV=0
    return 1
  fi

  # Comma-joined list of matching device paths, used in log messages.
  device_summary="$(IFS=,; printf '%s' "${devices[*]}")"
  GPS_DEVICE_PRESENT_STATE=1
  GPS_DEVICE_PRESENT_PREV=1

  if (( previous_present == 0 )); then
    blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${device_summary}" 0
    recovery_reason="device-reappeared"
  elif ! gps_stack_active; then
    recovery_reason="gpsd-inactive"
  fi

  if [[ -n "${recovery_reason}" ]]; then
    if restart_gps_stack "${recovery_reason}" "${device_summary}"; then
      return 0
    fi
    return 1
  fi

  GPS_STACK_ACTIVE_STATE=1
  return 0
}

# Return 0 when the regular file $1 exists and its modification time is at
# most $2 seconds old.
#
# Arguments:
#   $1 - path to the status file
#   $2 - maximum accepted age in seconds
status_file_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local now_sec
  local mtime_sec

  if [[ ! -f "${path}" ]]; then
    return 1
  fi

  now_sec="$(now_epoch_sec)"
  # A failing stat falls back to mtime 0, which makes the file look stale.
  mtime_sec="$(stat -c %Y "${path}" 2>/dev/null || echo 0)"
  (( now_sec - mtime_sec <= max_age_sec ))
}

# Inspect the ROS receiver status JSON and decide whether it is fresh.
#
# Arguments:
#   $1 - path to the status JSON file
#   $2 - maximum accepted heartbeat age in seconds
# Returns: 0 when the file parses as JSON, "socket_bound" is truthy and
#   "recv_thread_heartbeat_epoch_ms" is positive and at most $2 seconds older
#   than the current time; non-zero otherwise.
ros_receiver_status_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local now_epoch_ms_value

  now_epoch_ms_value="$(now_epoch_ms)"
  python3 - "${path}" "${now_epoch_ms_value}" "${max_age_sec}" <<'PY'
import json
import sys

path = sys.argv[1]
now_epoch_ms = int(sys.argv[2])
max_age_ms = int(sys.argv[3]) * 1000

# An unreadable or malformed status file counts as "not fresh".
try:
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
except Exception:
    raise SystemExit(1)

heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0)
socket_bound = bool(payload.get("socket_bound"))

if heartbeat_ms <= 0 or not socket_bound:
    raise SystemExit(1)

raise SystemExit(0 if now_epoch_ms - heartbeat_ms <= max_age_ms else 1)
PY
}

# Return 0 only when every ROS receiver check passes, evaluated in order:
# the service is active, ROBOT_RECEIVER_LOCAL_SOCKET_PATH is a socket, the
# status file's mtime is fresh, and the heartbeat inside it is fresh.
#
# Arguments:
#   $1 - maximum accepted age in seconds for both freshness checks
ros_receiver_healthy() {
  local max_age_sec="$1"

  service_is_active "${ROS_SERVICE}" \
    && [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] \
    && status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" \
    && ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}"
}

# Write the watchdog status JSON to WATCHDOG_STATUS_FILE.
#
# The document is written to a PID-suffixed temp file next to the target and
# then moved over it with `mv -f`.
#
# Arguments:
#   $1 - fault reason            $2 - recovery state
#   $3 - network_ok              $4 - camera_ok
#   $5 - ros_ok                  $6 - bside_ok
#   $7 - gps_ok                  $8 - gps_device_present
# Arguments 3-8 are inserted unquoted into the JSON, so they must be valid
# JSON values (the callers in this file pass 0 or 1). String values are not
# escaped.
write_watchdog_status() {
  local fault_reason="$1"
  local recovery_state="$2"
  local network_ok="$3"
  local camera_ok="$4"
  local ros_ok="$5"
  local bside_ok="$6"
  local gps_ok="$7"
  local gps_device_present="$8"
  local tmp_file

  tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
  cat > "${tmp_file}" <<EOF
{
  "updated_at_epoch_ms": $(now_epoch_ms),
  "fault_reason": "${fault_reason}",
  "recovery_state": "${recovery_state}",
  "network_ok": ${network_ok},
  "camera_ok": ${camera_ok},
  "ros_ok": ${ros_ok},
  "bside_ok": ${bside_ok},
  "gps_ok": ${gps_ok},
  "gps_device_present": ${gps_device_present},
  "network_fail_count": ${NETWORK_FAIL_COUNT},
  "targeted_restart_count": $(targeted_restart_total),
  "full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
  "last_action": "${LAST_ACTION}",
  "last_action_epoch_ms": ${LAST_ACTION_EPOCH_MS}
}
EOF
  mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
}

# Print one compact JSON record describing a watchdog event or sample.
#
# Arguments:
#   $1  - record type              $2  - action
#   $3  - fault reason             $4  - recovery state
#   $5  - free-form detail         $6  - incident id (optional; empty -> null)
#   $7  - network_ok (default 1)   $8  - camera_ok (default 1)
#   $9  - ros_ok (default 1)       $10 - bside_ok (default 1)
#   $11 - gps_ok (default 1)       $12 - gps_device_present (default 1)
# The *_ok flags become JSON booleans that are true only for the string "1".
# Globals read: LAST_ACTION, LAST_ACTION_EPOCH_MS, NETWORK_FAIL_COUNT,
#   FULL_RESTART_WINDOW_COUNT (plus the output of targeted_restart_total).
# Outputs: a single JSON line on stdout.
watchdog_emit_json() {
  local record_type="$1"
  local action="$2"
  local fault_reason="$3"
  local recovery_state="$4"
  local detail="$5"
  local incident_id="${6:-}"
  local network_ok="${7:-1}"
  local camera_ok="${8:-1}"
  local ros_ok="${9:-1}"
  local bside_ok="${10:-1}"
  local gps_ok="${11:-1}"
  local gps_device_present="${12:-1}"

  python3 - "${record_type}" "${action}" "${fault_reason}" "${recovery_state}" "${detail}" "${incident_id}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}" "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}" <<'PY'
import json
import sys
import time

# The 17 positional values are passed in exactly this order by the shell caller.
(
    record_type,
    action,
    fault_reason,
    recovery_state,
    detail,
    incident_id,
    network_ok,
    camera_ok,
    ros_ok,
    bside_ok,
    gps_ok,
    gps_device_present,
    last_action,
    last_action_epoch_ms,
    network_fail_count,
    targeted_restart_count,
    full_restart_count,
) = sys.argv[1:18]

payload = {
    "ts_unix_ms": time.time_ns() // 1_000_000,
    "record_type": record_type,
    "action": action,
    "fault_reason": fault_reason,
    "recovery_state": recovery_state,
    "detail": detail,
    "incident_id": incident_id or None,
    "network_ok": network_ok == "1",
    "camera_ok": camera_ok == "1",
    "ros_ok": ros_ok == "1",
    "bside_ok": bside_ok == "1",
    "gps_ok": gps_ok == "1",
    "gps_device_present": gps_device_present == "1",
    "network_fail_count": int(network_fail_count),
    "targeted_restart_count": int(targeted_restart_count),
    "full_restart_count": int(full_restart_count),
    "last_action": last_action,
    "last_action_epoch_ms": int(last_action_epoch_ms or 0),
}
print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False))
PY
}

# Append one record to WATCHDOG_EVENT_LOG; best effort, always returns 0.
#
# Arguments: forwarded unchanged to watchdog_emit_json.
# Does nothing when WATCHDOG_EVENT_LOG is empty. A failure to build or append
# the record is reported through blitz_log once per failure streak
# (WATCHDOG_EVENT_LOG_FAILURE_REPORTED is reset after the next success).
watchdog_append_event() {
  local line=""

  [[ -n "${WATCHDOG_EVENT_LOG}" ]] || return 0

  # stderr is captured too, so a failing emitter's message ends up in `line`
  # and can be included in the failure log entry.
  if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
    if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
      blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${line}" 0 || true
      WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
    fi
    return 0
  fi

  if ! blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${line}"; then
    if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
      blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=append-failed" 0 || true
      WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
    fi
    return 0
  fi

  WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
}

# Append one record to WATCHDOG_SAMPLE_LOG; best effort, always returns 0.
#
# Arguments: forwarded unchanged to watchdog_emit_json.
# Does nothing when WATCHDOG_SAMPLE_LOG is empty. A failure to build or append
# the record is reported through blitz_log once per failure streak
# (WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED is reset after the next success).
watchdog_append_sample() {
  local line=""

  [[ -n "${WATCHDOG_SAMPLE_LOG}" ]] || return 0

  # stderr is captured too, so a failing emitter's message ends up in `line`
  # and can be included in the failure log entry.
  if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
    if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
      blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${line}" 0 || true
      WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
    fi
    return 0
  fi

  if ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${line}"; then
    if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
      blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=append-failed" 0 || true
      WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
    fi
    return 0
  fi

  WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
}

# Emit a "state-transition" event when the (fault_reason, recovery_state) pair
# differs from the last pair that was reported, then remember the new pair.
#
# Arguments:
#   $1 - current fault reason
#   $2 - current recovery state
watchdog_record_state_transition() {
  local fault_reason="$1"
  local recovery_state="$2"

  # Unchanged state: nothing to report.
  if [[ "${fault_reason}" == "${LAST_REPORTED_FAULT_REASON}" && "${recovery_state}" == "${LAST_REPORTED_RECOVERY_STATE}" ]]; then
    return 0
  fi

  watchdog_append_event "event" "state-transition" "${fault_reason}" "${recovery_state}" "" ""
  LAST_REPORTED_FAULT_REASON="${fault_reason}"
  LAST_REPORTED_RECOVERY_STATE="${recovery_state}"
}

# Run the incident-capture launcher from BOOT_SCRIPT_DIR for a watchdog-detected
# failure.
#
# Arguments:
#   $1 - reason string
#   $2 - name of the affected unit
# Outputs: whatever the launcher prints on stdout (callers in this file capture
#   it as the incident id). The launcher's stderr is discarded.
# Returns: always 0; launcher failures are deliberately ignored.
watchdog_launch_incident() {
  local reason="$1"
  local unit_name="$2"

  "${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" \
    --source watchdog \
    --reason "${reason}" \
    --unit "${unit_name}" \
    --result failure \
    --exit-status 1 2>/dev/null || true
}

# Record $1 as the most recent recovery action and stamp it with the current
# time in epoch milliseconds.
set_last_action() {
  LAST_ACTION="$1"
  LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}

# Print the sum of all per-fault-key counters in TARGETED_RESTART_WINDOW_COUNT.
targeted_restart_total() {
  local total=0
  local count

  # Iterate over the stored values directly; this avoids putting a quoted
  # associative-array subscript inside an arithmetic expression.
  for count in "${TARGETED_RESTART_WINDOW_COUNT[@]}"; do
    total=$(( total + count ))
  done
  printf '%s\n' "${total}"
}

# Count a targeted restart for a fault key inside a 60-second window.
#
# A new window starts (count = 1) when none exists for the key or the current
# one began more than 60 seconds ago; otherwise the count is incremented.
#
# Arguments:
#   $1 - fault key
# Globals written: TARGETED_RESTART_WINDOW_START[$1],
#   TARGETED_RESTART_WINDOW_COUNT[$1].
# Returns: 0 when this is at least the second restart in the window (callers
#   in this file treat that as "escalate"), 1 otherwise.
register_targeted_restart() {
  local fault_key="$1"
  local now_sec
  local window_start
  local count

  now_sec="$(now_epoch_sec)"
  window_start="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}"
  count="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}"

  if (( window_start == 0 || now_sec - window_start > 60 )); then
    window_start="${now_sec}"
    count=1
  else
    count=$(( count + 1 ))
  fi

  TARGETED_RESTART_WINDOW_START["${fault_key}"]="${window_start}"
  TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${count}"
  (( count >= 2 ))
}

# Count a full-stack restart inside a 600-second window and enter back-off
# when the count reaches three.
#
# Globals written: FULL_RESTART_WINDOW_START, FULL_RESTART_WINDOW_COUNT, and,
#   when the threshold is hit, BACKOFF_UNTIL (now + 60 seconds).
record_full_restart() {
  local now_sec

  now_sec="$(now_epoch_sec)"
  if (( FULL_RESTART_WINDOW_START == 0 || now_sec - FULL_RESTART_WINDOW_START > 600 )); then
    # No window yet, or the previous one has expired: start a new one.
    FULL_RESTART_WINDOW_START="${now_sec}"
    FULL_RESTART_WINDOW_COUNT=1
  else
    FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
  fi

  if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
    BACKOFF_UNTIL=$(( now_sec + 60 ))
    watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" ""
  fi
}

# Restart only the B-side service, escalating to a full stack restart when
# register_targeted_restart reports a repeat for the same fault key.
#
# Arguments:
#   $1 - fault key used for the restart window
#   $2 - reason string used in logs and events
# Globals written: RECOVERY_ACTION_TAKEN (set to 1 on the targeted path), plus
#   whatever set_last_action / full_restart_stack update.
# Returns: 0 after an escalation (regardless of its outcome) or a successful
#   restart; otherwise the exit status of `systemctl restart`.
restart_bside_targeted() {
  local fault_key="$1"
  local reason="$2"
  local rc=0
  local incident_id=""

  if register_targeted_restart "${fault_key}"; then
    blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
    watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" ""
    full_restart_stack "${reason}-escalated"
    return 0
  fi

  incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")"
  set_last_action "restart-bside"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
  watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"

  # Capture the status at the call site: reading $? after an `if ...; fi`
  # whose condition failed always yields 0, which would hide the failure.
  systemctl restart "${B_SIDE_SERVICE}" || rc=$?
  if (( rc == 0 )); then
    blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
    watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
    return 0
  fi

  blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
  watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}"
  return "${rc}"
}

# Restart the whole stack: stop B-side, restart the ROS receiver, wait for its
# Unix socket, then start B-side again.
#
# Arguments:
#   $1 - reason string used in logs and events
# Globals written: RECOVERY_ACTION_TAKEN; `recovery_state` and `fault_reason`
#   are assigned without `local`, so they overwrite the variables of the same
#   name used by the main loop; record_full_restart updates the restart window.
# Returns: 0 on success, otherwise the exit status of the failing stage. Every
#   outcome (success or failure) is counted via record_full_restart, and every
#   failing stage emits a "full-restart-failure" event naming the stage.
full_restart_stack() {
  local reason="$1"
  local rc
  local incident_id=""

  incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")"
  set_last_action "full-restart"
  RECOVERY_ACTION_TAKEN=1
  recovery_state="recovering"
  fault_reason="${reason}"

  blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
  watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}"
  # Stopping B-side is best effort; it may already be down.
  systemctl stop "${B_SIDE_SERVICE}" || true

  if systemctl restart "${ROS_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=restart-ros rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi

  if bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=wait-socket rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi

  if systemctl start "${B_SIDE_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}"
    record_full_restart
    return "${rc}"
  fi

  watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}"
  record_full_restart
}

# Return 0 when a network fault is being simulated: fault injection must be
# enabled (BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION == "1") and the marker file
# NETWORK_FAULT_FILE must exist. An unset switch counts as disabled, so the
# check stays safe under `set -u`.
network_fault_injected() {
  [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}" == "1" && -f "${NETWORK_FAULT_FILE}" ]]
}

# Resolve the 5G interface via blitz_resolve_5g_interface.
#
# Globals written: NETWORK_LAST_INTERFACE (empty when resolution fails) and,
#   on success, NETWORK_ROUTE_INTERFACE_LAST_KNOWN.
# Returns: 0 when an interface name was obtained, 1 otherwise.
resolve_network_interface() {
  NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)"
  if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then
    NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}"
    return 0
  fi
  return 1
}

# Print, one per line, the hosts whose routes are managed by the watchdog:
# BLITZ_TIME_SERVER_IP first (when set), then every entry of
# BLITZ_5G_ROUTE_TARGETS (comma- and/or whitespace-separated) that is not the
# time server itself.
network_route_targets() {
  local raw_targets="${BLITZ_5G_ROUTE_TARGETS:-}"
  local -a targets=()
  local target

  if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then
    printf '%s\n' "${BLITZ_TIME_SERVER_IP}"
  fi

  # Split into an array with `read` rather than an unquoted expansion, so
  # entries are never subject to pathname expansion and an unset variable does
  # not abort under `set -u`. With -d '' read consumes the whole value and
  # reports EOF as a non-zero status, hence `|| true`.
  IFS=$' \t\n' read -r -d '' -a targets <<< "${raw_targets//,/ }" || true

  for target in "${targets[@]}"; do
    if [[ -n "${target}" && "${target}" != "${BLITZ_TIME_SERVER_IP:-}" ]]; then
      printf '%s\n' "${target}"
    fi
  done
}

# Log the route the kernel currently picks for every managed target.
#
# Arguments:
#   $1 - label recorded as "action=" in each log line
# For each target the first line of `ip route get` (stdout and stderr merged)
# is logged; an empty result is logged as "unresolved".
log_target_route_paths() {
  local action="$1"
  local target
  local route_output

  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    route_output="$(ip route get "${target}" 2>&1 | head -n 1 || true)"
    if [[ -z "${route_output}" ]]; then
      route_output="unresolved"
    fi
    blitz_log "${STEP}" "route-path" "info" "action=${action} target=${target} route=${route_output}" 0
  done < <(network_route_targets)
}

# Return 0 when the route text $1 contains " dev <interface>" as a whole token,
# either followed by a space or at the end of the text.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - interface name; an empty name never matches
route_output_uses_interface() {
  local route_output="$1"
  local interface_name="$2"

  [[ -n "${interface_name}" ]] || return 1
  # Appending one space lets a single pattern cover both the "mid-line" and
  # the "end-of-line" position of the interface token.
  [[ "${route_output} " == *" dev ${interface_name} "* ]]
}

# Return 0 when the route text $1 goes through gateway $2, i.e. contains
# "via <gateway>" as whole whitespace-delimited tokens.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - gateway address; an empty gateway never matches
route_output_uses_gateway() {
  local route_output="$1"
  local gateway="$2"

  [[ -n "${gateway}" ]] || return 1
  # Pad both ends so the match is token-exact: a plain substring test would
  # let gateway 10.0.0.1 match "via 10.0.0.10".
  [[ " ${route_output} " == *" via ${gateway} "* ]]
}

# Return 0 when the route text uses BOTH the given interface and the given
# gateway.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - expected interface name
#   $3 - expected gateway address
route_is_desired_target_route() {
  local route_output="$1"
  local interface_name="$2"
  local gateway="$3"

  route_output_uses_interface "${route_output}" "${interface_name}" \
    && route_output_uses_gateway "${route_output}" "${gateway}"
}

# Return 0 when the route text matches ANY of: the given interface, the given
# gateway, or the configured BLITZ_5G_GATEWAY.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - interface name (optional)
#   $3 - gateway address (optional)
route_is_managed_5g_route() {
  local route_output="$1"
  local interface_name="${2:-}"
  local gateway="${3:-}"

  if route_output_uses_interface "${route_output}" "${interface_name}"; then
    return 0
  fi
  if route_output_uses_gateway "${route_output}" "${gateway}"; then
    return 0
  fi
  if route_output_uses_gateway "${route_output}" "${BLITZ_5G_GATEWAY:-}"; then
    return 0
  fi
  return 1
}

# Print the interface name to use when cleaning up routes.
#
# Sources are tried in order: NETWORK_LAST_INTERFACE, then
# NETWORK_ROUTE_INTERFACE_LAST_KNOWN, then blitz_read_5g_info_interface called
# with BLITZ_5G_INFO_JSON.
# Returns: 0 and prints the name when one is found, 1 (no output) otherwise.
resolve_route_cleanup_interface() {
  local interface_name=""
  local info_json="${BLITZ_5G_INFO_JSON:-}"

  if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then
    printf '%s\n' "${NETWORK_LAST_INTERFACE}"
    return 0
  fi
  if [[ -n "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" ]]; then
    printf '%s\n' "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}"
    return 0
  fi

  interface_name="$(blitz_read_5g_info_interface "${info_json}" || true)"
  if [[ -n "${interface_name}" ]]; then
    printf '%s\n' "${interface_name}"
    return 0
  fi
  return 1
}

# Print the gateway to use for the given interface.
#
# The token following "via" in the first default route reported for the
# interface is preferred; BLITZ_5G_GATEWAY is the fallback.
#
# Arguments:
#   $1 - interface name
# Returns: 0 and prints the gateway when one is found, 1 (no output) otherwise.
resolve_network_gateway() {
  local interface_name="$1"
  local default_route
  local gateway=""
  local tokens=()
  local index

  default_route="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)"
  if [[ -n "${default_route}" ]]; then
    read -r -a tokens <<< "${default_route}"
    # Stop one short of the end so tokens[index + 1] always exists.
    for (( index = 0; index < ${#tokens[@]} - 1; index++ )); do
      if [[ "${tokens[index]}" == "via" ]]; then
        gateway="${tokens[index + 1]}"
        break
      fi
    done
  fi

  if [[ -n "${gateway}" ]]; then
    printf '%s\n' "${gateway}"
    return 0
  fi
  if [[ -n "${BLITZ_5G_GATEWAY:-}" ]]; then
    printf '%s\n' "${BLITZ_5G_GATEWAY}"
    return 0
  fi
  return 1
}

# Make sure every managed target has a /32 route via the 5G gateway/interface.
#
# Arguments:
#   $1 - 5G interface name (required)
#   $2 - gateway (optional; resolved from the interface when omitted)
# Targets whose existing /32 route already uses both the interface and the
# gateway are left untouched; the rest are set with `ip route replace`.
# Globals written: NETWORK_ROUTE_INTERFACE_LAST_KNOWN (only when at least one
#   route was changed).
# Returns: 0 on success; 1 when the interface is empty or no gateway can be
#   determined; otherwise the exit status of the first failing
#   `ip route replace` (remaining targets are not processed).
sync_target_routes_to_5g() {
  local interface_name="$1"
  local gateway="${2:-}"
  local route_output=""
  local updated=0
  local target
  local rc

  if [[ -z "${interface_name}" ]]; then
    return 1
  fi

  if [[ -z "${gateway}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
  fi
  if [[ -z "${gateway}" ]]; then
    blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1
    return 1
  fi

  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    if [[ -n "${route_output}" ]] && route_is_desired_target_route "${route_output}" "${interface_name}" "${gateway}"; then
      continue
    fi
    if ip route replace "${target}/32" via "${gateway}" dev "${interface_name}"; then
      updated=1
      blitz_log "${STEP}" "route-sync-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0
    else
      rc=$?
      blitz_log "${STEP}" "route-sync-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}"
      return "${rc}"
    fi
  done < <(network_route_targets)

  if (( updated == 1 )); then
    NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}"
    log_target_route_paths "sync-to-5g"
  fi
  return 0
}

# Delete the /32 routes of managed targets that point at the 5G path.
#
# Arguments:
#   $1 - interface name (optional; resolved via resolve_route_cleanup_interface)
#   $2 - gateway (optional; resolved from the interface, then BLITZ_5G_GATEWAY)
# Only routes accepted by route_is_managed_5g_route are deleted; any other
# route for a target is left alone.
# Returns: 0 on success (including "nothing to remove"); otherwise the exit
#   status of the first failing `ip route del` (remaining targets are not
#   processed).
clear_target_routes_from_5g() {
  local interface_name="${1:-}"
  local gateway="${2:-}"
  local route_output=""
  local target
  local removed_any=0
  local rc

  if [[ -z "${interface_name}" ]]; then
    interface_name="$(resolve_route_cleanup_interface || true)"
  fi
  if [[ -z "${gateway}" && -n "${interface_name}" ]]; then
    gateway="$(resolve_network_gateway "${interface_name}" || true)"
  fi
  if [[ -z "${gateway}" ]]; then
    gateway="${BLITZ_5G_GATEWAY:-}"
  fi

  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)"
    if [[ -z "${route_output}" ]] || ! route_is_managed_5g_route "${route_output}" "${interface_name}" "${gateway}"; then
      continue
    fi
    if ip route del "${target}/32"; then
      removed_any=1
      blitz_log "${STEP}" "route-clear-target" "success" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
    else
      rc=$?
      blitz_log "${STEP}" "route-clear-target" "failure" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" "${rc}"
      return "${rc}"
    fi
  done < <(network_route_targets)

  if (( removed_any == 1 )); then
    blitz_log "${STEP}" "route-clear" "success" "interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
    log_target_route_paths "clear-from-5g"
  fi
  return 0
}

# Re-establish the managed target routes over the 5G interface and verify them.
#
# Arguments:
#   $1 - 5G interface name
# Steps: resolve the gateway, sync the target routes, confirm via
# blitz_route_ready that BLITZ_TIME_SERVER_IP is routed, then ping it once
# through the interface. When any step after gateway resolution fails, the
# managed routes are cleared again (best effort).
# Returns: 0 when all steps succeed, 1 otherwise.
repair_network_routes() {
  local interface_name="$1"
  local gateway=""
  local route_output

  if [[ -z "${interface_name}" ]]; then
    return 1
  fi

  gateway="$(resolve_network_gateway "${interface_name}" || true)"
  if [[ -z "${gateway}" ]]; then
    blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1
    return 1
  fi

  if ! sync_target_routes_to_5g "${interface_name}" "${gateway}"; then
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    return 1
  fi

  route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)"
  if [[ -z "${route_output}" ]]; then
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1
    return 1
  fi

  if ! ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then
    clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
    blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1
    return 1
  fi

  blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${route_output}" 0
  return 0
}

# Check the primary (5G) network path.
#
# Healthy means: no fault is injected, the 5G interface resolves,
# blitz_route_ready reports a route to BLITZ_TIME_SERVER_IP over it, and a
# single ping through that interface succeeds.
# Globals written: NETWORK_LAST_INTERFACE (cleared first, then set by
#   resolve_network_interface).
# Returns: 0 when healthy, non-zero otherwise.
network_is_healthy() {
  local route_output

  NETWORK_LAST_INTERFACE=""
  if network_fault_injected; then
    return 1
  fi
  if ! resolve_network_interface; then
    return 1
  fi

  route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
  if [[ -z "${route_output}" ]]; then
    return 1
  fi

  ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}

# Check whether BLITZ_TIME_SERVER_IP is reachable without pinning the check to
# a specific interface.
#
# Returns: 0 when blitz_route_ready reports a route to the time server and a
#   single ping succeeds; non-zero otherwise (including when
#   BLITZ_TIME_SERVER_IP is unset or empty).
fallback_network_is_healthy() {
  local route_output

  if [[ -z "${BLITZ_TIME_SERVER_IP:-}" ]]; then
    return 1
  fi

  route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" || true)"
  if [[ -z "${route_output}" ]]; then
    return 1
  fi

  ping -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}

# Poll network_is_healthy until it succeeds or the attempt budget runs out.
#
# Arguments:
#   $1 - number of attempts; there is a one-second sleep after each failed
#        attempt, and the time spent inside each health check comes on top
# A "waiting" line is logged on the first attempt and every fifth one after.
# Returns: 0 as soon as the network is healthy, 1 after the last attempt fails.
wait_for_network_recovery() {
  local timeout_sec="$1"
  local waited=0

  while (( waited < timeout_sec )); do
    if network_is_healthy; then
      blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${waited}" 0
      return 0
    fi
    # waited == 0 also satisfies this test, so the first attempt is logged.
    if (( waited % 5 == 0 )); then
      blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${waited}" 0
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done

  blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
  return 1
}

# Recover the 5G network path.
#
# First tries the cheap fix: re-resolve the interface and repair the routes.
# When that does not work it launches an incident capture, stops B-side, runs
# the 5g-dial.sh script and waits for the network to come back; afterwards it
# restarts B-side only (ROS receiver healthy) or the full stack.
#
# Globals written: RECOVERY_ACTION_TAKEN, NETWORK_COOLDOWN_UNTIL,
#   NETWORK_FAIL_COUNT, plus whatever the called helpers update.
# Returns: 0 when the network was recovered (the outcome of the follow-up
#   service restart is not propagated); otherwise the exit status of the
#   failing redial or post-check stage.
perform_network_recovery() {
  local rc=0
  local incident_id=""

  if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
    set_last_action "route-repair"
    RECOVERY_ACTION_TAKEN=1
    NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
    NETWORK_FAIL_COUNT=0
    blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
    watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" ""
    return 0
  fi

  incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")"
  set_last_action "network-recovery"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
  watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}"
  # Stopping B-side is best effort; it may already be down.
  systemctl stop "${B_SIDE_SERVICE}" || true

  if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
    watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}"
    return "${rc}"
  fi

  if wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
    watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}"
    return "${rc}"
  fi

  NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
  NETWORK_FAIL_COUNT=0
  watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}"

  if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    restart_bside_targeted "network" "network-recovered"
    return 0
  fi
  full_restart_stack "network-recovered-ros-unhealthy"
  return 0
}

# --- Startup ---------------------------------------------------------------
# The blitz_* helpers are not defined in this file; presumably they come from
# the common.sh sourced at the top (TODO confirm).
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command stat "${STEP}"
blitz_require_command ping "${STEP}"
blitz_require_command python3 "${STEP}"
blitz_prepare_runtime_dir
blitz_require_run_context

# Status files and the fault-injection marker live in the runtime directory;
# the JSONL event/sample logs live in the run directory.
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl"
WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl"

# --- Main loop -------------------------------------------------------------
# One iteration per BLITZ_WATCHDOG_INTERVAL_SEC: evaluate the network, GPS,
# camera, B-side and ROS receiver checks in that order, trigger at most one
# service-restarting recovery action (tracked via RECOVERY_ACTION_TAKEN), then
# publish the status file and a sample record. A later check that fails
# overwrites fault_reason/recovery_state set by an earlier one.
while true; do
  fault_reason="none"
  recovery_state="ok"
  network_ok=1
  camera_ok=1
  ros_ok=1
  bside_ok=1
  gps_ok=1
  gps_device_present=1
  RECOVERY_ACTION_TAKEN=0
  now_sec="$(now_epoch_sec)"

  # Seed the GPS flags from the cached state so the back-off branch below
  # reports the last known GPS condition.
  if gps_monitor_enabled; then
    gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
    if (( GPS_DEVICE_PRESENT_STATE == 0 || GPS_STACK_ACTIVE_STATE == 0 )); then
      gps_ok=0
    fi
  fi

  # Back-off: run no checks and no recovery, only publish the back-off state.
  if (( BACKOFF_UNTIL > now_sec )); then
    fault_reason="backoff"
    recovery_state="backoff"
    watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
    write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
    watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
    sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
    continue
  fi

  # --- Network ---
  if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
    # Within the cool-down after a recovery: skip the network check.
    recovery_state="recovering"
  elif ! network_is_healthy; then
    clear_target_routes_from_5g || true
    if fallback_network_is_healthy; then
      # The time server is still reachable without the 5G path: degraded.
      NETWORK_FAIL_COUNT=0
      fault_reason="network_fallback_active"
      recovery_state="degraded"
      blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0
      # Retry the primary path at most once every 10 seconds.
      if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then
        NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}"
        if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
          NETWORK_PRIMARY_LAST_RETRY_SEC=0
          fault_reason="none"
          recovery_state="ok"
          blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0
          log_target_route_paths "primary-restored"
        fi
      fi
    else
      network_ok=0
      NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
      fault_reason="network_or_robot_unreachable"
      recovery_state="recovering"
      blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
      if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
        perform_network_recovery || true
      fi
    fi
  else
    NETWORK_PRIMARY_LAST_RETRY_SEC=0
    NETWORK_FAIL_COUNT=0
    sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true
  fi

  # --- GPS ---
  if check_gps_health "${now_sec}"; then
    gps_ok=1
  else
    gps_ok=0
    # A GPS problem is only reported when nothing else has claimed the
    # fault reason so far.
    if [[ "${fault_reason}" == "none" ]]; then
      if (( GPS_DEVICE_PRESENT_STATE == 0 )); then
        fault_reason="gps_device_missing"
      else
        fault_reason="gps_reconnect_failed"
      fi
      recovery_state="degraded"
    fi
  fi
  gps_device_present="${GPS_DEVICE_PRESENT_STATE}"

  # --- Camera ---
  if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
    camera_ok=0
    fault_reason="camera_missing"
    recovery_state="degraded"
    CAMERA_MISSING_PREV=1
    CAMERA_RECOVERY_STABLE_COUNT=0
  elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
    # The device is back: require two consecutive sightings before
    # restarting B-side.
    CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
    recovery_state="recovering"
    fault_reason="camera_recovered"
    if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
      restart_bside_targeted "camera" "camera-reappeared" || true
      CAMERA_MISSING_PREV=0
      CAMERA_RECOVERY_STABLE_COUNT=0
    fi
  else
    CAMERA_RECOVERY_STABLE_COUNT=0
  fi

  # --- B-side service ---
  if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
    bside_ok=0
    fault_reason="bside_status_stale"
    recovery_state="recovering"
    restart_bside_targeted "bside" "bside-unhealthy" || true
  fi

  # --- ROS receiver ---
  if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    ros_ok=0
    fault_reason="ros_receiver_unhealthy"
    recovery_state="recovering"
    full_restart_stack "ros-unhealthy" || true
  fi

  # --- Publish ---
  watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
  write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
  watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
  sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done