#!/usr/bin/env bash
#
# Watchdog for the B-side / ROS receiver stack.
#
# The main loop at the bottom of this file periodically checks the network,
# the camera device node, the B-side service and the ROS receiver, performs
# targeted or full restarts when one of them is unhealthy, and publishes a
# JSON status file after every tick.
#
# Helper functions prefixed with `blitz_` are provided by common.sh.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"

# Step label passed to the blitz_* helpers.
STEP="watchdog"

# systemd units supervised by this watchdog.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"

# Runtime file paths; filled in once the runtime directory has been prepared.
B_SIDE_STATUS_FILE=""
ROS_STATUS_FILE=""
WATCHDOG_STATUS_FILE=""
NETWORK_FAULT_FILE=""

# Camera tracking: whether the device was missing on an earlier tick, and how
# many consecutive ticks it has been present again since.
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0

# Network tracking: consecutive failed checks, and the epoch second until
# which network checks are skipped after a recovery.
NETWORK_FAIL_COUNT=0
NETWORK_COOLDOWN_UNTIL=0

# Epoch second until which the whole watchdog stays passive.
BACKOFF_UNTIL=0

# Most recent recovery action and when it was taken (reported in the status).
LAST_ACTION="none"
LAST_ACTION_EPOCH_MS=0

# Rate-limiting window for full stack restarts.
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0

# Interface name resolved by the most recent network check ("" if none).
NETWORK_LAST_INTERFACE=""

# Per-fault-key rate-limiting windows for targeted B-side restarts.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()

# Print the current time as whole seconds since the Unix epoch.
now_epoch_sec() {
  date +%s
}

# Print the current time as milliseconds since the Unix epoch.
# Relies on the %N format extension of GNU date (%3N = first three digits of
# the nanosecond field).
now_epoch_ms() {
  date +%s%3N
}

# Succeed when systemd reports the given unit as active.
# Arguments: $1 - unit name
# Returns:   exit status of `systemctl is-active --quiet`
service_is_active() {
  systemctl is-active --quiet "$1"
}

# Succeed when a regular file exists and was modified recently enough.
# Arguments: $1 - path to the file
#            $2 - maximum allowed age in seconds (inclusive)
# Returns:   0 if the file's mtime is at most $2 seconds old, non-zero otherwise
status_file_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local now_sec
  local mtime_sec

  if [[ ! -f "${path}" ]]; then
    return 1
  fi
  now_sec="$(now_epoch_sec)"
  # If stat fails (e.g. the file vanished after the -f test), fall back to
  # mtime 0 so the age comparison below is made against the epoch.
  mtime_sec="$(stat -c %Y "${path}" 2>/dev/null || echo 0)"
  (( now_sec - mtime_sec <= max_age_sec ))
}

# Succeed when the ROS receiver status JSON reports a bound socket and a
# recent receive-thread heartbeat.
# Arguments: $1 - path to the status JSON file
#            $2 - maximum allowed heartbeat age in seconds (inclusive)
# Returns:   0 when `socket_bound` is truthy and
#            `recv_thread_heartbeat_epoch_ms` is positive and at most $2
#            seconds old; 1 otherwise, including when the file is missing,
#            unreadable, or not the expected JSON shape.
ros_receiver_status_fresh() {
  local path="$1"
  local max_age_sec="$2"
  local now_epoch_ms_value

  now_epoch_ms_value="$(now_epoch_ms)"
  python3 - "${path}" "${now_epoch_ms_value}" "${max_age_sec}" <<'PY'
import json
import sys

path = sys.argv[1]
now_epoch_ms = int(sys.argv[2])
max_age_ms = int(sys.argv[3]) * 1000

# Field extraction sits inside the try block so a payload that is not a JSON
# object, or a heartbeat that is not numeric, is reported as "not fresh"
# instead of ending in an uncaught traceback.
try:
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0)
    socket_bound = bool(payload.get("socket_bound"))
except Exception:
    raise SystemExit(1)

if heartbeat_ms <= 0 or not socket_bound:
    raise SystemExit(1)

raise SystemExit(0 if now_epoch_ms - heartbeat_ms <= max_age_ms else 1)
PY
}

# Succeed only when every ROS receiver health signal is good:
#   1. the ROS service unit is active,
#   2. ROBOT_RECEIVER_LOCAL_SOCKET_PATH exists and is a socket,
#   3. the status file was modified within the allowed age,
#   4. the heartbeat inside the status file is within the allowed age.
# Checks run in that order and stop at the first failure.
# Globals:   ROS_SERVICE, ROS_STATUS_FILE, ROBOT_RECEIVER_LOCAL_SOCKET_PATH (read)
# Arguments: $1 - maximum allowed age in seconds for checks 3 and 4
ros_receiver_healthy() {
  local max_age_sec="$1"

  service_is_active "${ROS_SERVICE}" \
    && [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] \
    && status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" \
    && ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}"
}

# Publish the watchdog status as a JSON document.
# The document is written to a per-process temporary file next to the target
# and then renamed over it, so a reader never observes a half-written file.
# Globals:   WATCHDOG_STATUS_FILE, NETWORK_FAIL_COUNT, FULL_RESTART_WINDOW_COUNT,
#            LAST_ACTION, LAST_ACTION_EPOCH_MS (read)
# Arguments: $1 - fault reason string
#            $2 - recovery state string
#            $3..$6 - network / camera / ROS / B-side health, emitted verbatim
#                     as JSON values (callers in this file pass 0 or 1)
# NOTE: the string arguments are interpolated without JSON escaping, so they
# must not contain double quotes or backslashes.
write_watchdog_status() {
  local fault_reason="$1"
  local recovery_state="$2"
  local network_ok="$3"
  local camera_ok="$4"
  local ros_ok="$5"
  local bside_ok="$6"
  local tmp_file

  tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
  cat > "${tmp_file}" <<EOF
{
"updated_at_epoch_ms": $(now_epoch_ms),
"fault_reason": "${fault_reason}",
"recovery_state": "${recovery_state}",
"network_ok": ${network_ok},
"camera_ok": ${camera_ok},
"ros_ok": ${ros_ok},
"bside_ok": ${bside_ok},
"network_fail_count": ${NETWORK_FAIL_COUNT},
"targeted_restart_count": $(targeted_restart_total),
"full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
"last_action": "${LAST_ACTION}",
"last_action_epoch_ms": ${LAST_ACTION_EPOCH_MS}
}
EOF
  mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
}

# Record the most recent recovery action together with its timestamp.
# Globals:   LAST_ACTION, LAST_ACTION_EPOCH_MS (written)
# Arguments: $1 - action name
set_last_action() {
  LAST_ACTION="$1"
  LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}

# Print the sum of the targeted-restart counters across all fault keys.
# Globals: TARGETED_RESTART_WINDOW_COUNT (read)
# Outputs: the total on stdout (0 when no counter exists yet)
targeted_restart_total() {
  local total=0
  local key

  for key in "${!TARGETED_RESTART_WINDOW_COUNT[@]}"; do
    # Fetch the counter with a parameter expansion rather than a bare
    # subscript inside $(( )), so the key text is never handed to the
    # arithmetic parser.
    total=$(( total + ${TARGETED_RESTART_WINDOW_COUNT["${key}"]} ))
  done
  printf '%s\n' "${total}"
}

# Count a targeted restart for a fault key and report whether to escalate.
# Each key has its own window: it opens at the first restart and is replaced
# by a new one once more than 60 seconds have passed since it opened.
# Globals:   TARGETED_RESTART_WINDOW_START, TARGETED_RESTART_WINDOW_COUNT
#            (read and written)
# Arguments: $1 - fault key (e.g. "camera", "bside", "network")
# Returns:   0 when this is at least the second restart inside the current
#            window (caller should escalate), 1 otherwise
register_targeted_restart() {
  local fault_key="$1"
  local now_sec
  local window_start
  local count

  now_sec="$(now_epoch_sec)"
  window_start="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}"
  count="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}"
  if (( window_start == 0 || now_sec - window_start > 60 )); then
    # No window yet, or the previous one has expired: open a new one.
    window_start="${now_sec}"
    count=1
  else
    count=$(( count + 1 ))
  fi
  TARGETED_RESTART_WINDOW_START["${fault_key}"]="${window_start}"
  TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${count}"
  (( count >= 2 ))
}

# Count a full stack restart and arm the global backoff when restarts pile up.
# The counting window opens at the first restart and is replaced once more
# than 600 seconds have passed since it opened. From the third restart inside
# one window onwards, BACKOFF_UNTIL is set 60 seconds into the future.
# Globals: FULL_RESTART_WINDOW_START, FULL_RESTART_WINDOW_COUNT (read/written),
#          BACKOFF_UNTIL (written)
record_full_restart() {
  local now_sec

  now_sec="$(now_epoch_sec)"
  if (( FULL_RESTART_WINDOW_START == 0 || now_sec - FULL_RESTART_WINDOW_START > 600 )); then
    FULL_RESTART_WINDOW_START="${now_sec}"
    FULL_RESTART_WINDOW_COUNT=1
  else
    FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
  fi
  if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
    BACKOFF_UNTIL=$(( now_sec + 60 ))
  fi
}

# Restart only the B-side service, escalating to a full stack restart when
# the same fault key keeps triggering restarts.
# Globals:   STEP, B_SIDE_SERVICE (read); RECOVERY_ACTION_TAKEN (written)
# Arguments: $1 - fault key used for per-fault rate limiting
#            $2 - human-readable reason included in the log lines
# Returns:   0 when the restart succeeded or the request was escalated;
#            otherwise the exit status of `systemctl restart`
restart_bside_targeted() {
  local fault_key="$1"
  local reason="$2"
  local rc=0

  if register_targeted_restart "${fault_key}"; then
    blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
    full_restart_stack "${reason}-escalated"
    return 0
  fi

  set_last_action "restart-bside"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0

  # Capture the status on the failing command itself. Reading $? after a
  # completed `if ...; fi` whose condition was false always yields 0, which
  # would hide the failure from both the log and the caller.
  systemctl restart "${B_SIDE_SERVICE}" || rc=$?
  if (( rc == 0 )); then
    blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
    return 0
  fi

  blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
  return "${rc}"
}

# Restart the whole stack in dependency order: stop the B-side service,
# restart the ROS receiver, wait for its Unix socket, then start the B-side
# service again. Every exit path records the attempt via record_full_restart
# so repeated restarts feed the backoff logic.
# Globals:   STEP, B_SIDE_SERVICE, ROS_SERVICE, SCRIPT_DIR,
#            BLITZ_ROS_SOCKET_WAIT_SEC (read);
#            RECOVERY_ACTION_TAKEN, recovery_state, fault_reason (written —
#            the latter two are the main loop's per-tick status variables)
# Arguments: $1 - reason, used in log lines and stored in fault_reason
# Returns:   0 on success, otherwise the exit status of the failing step
full_restart_stack() {
  local reason="$1"
  local rc

  set_last_action "full-restart"
  RECOVERY_ACTION_TAKEN=1
  recovery_state="recovering"
  fault_reason="${reason}"

  blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
  # Best effort: a failure to stop the B-side service does not abort the restart.
  systemctl stop "${B_SIDE_SERVICE}" || true

  if systemctl restart "${ROS_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi

  if bash "${SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi

  if systemctl start "${B_SIDE_SERVICE}"; then
    blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0
  else
    rc=$?
    blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
    record_full_restart
    return "${rc}"
  fi
  record_full_restart
}

# Succeed when a simulated network outage is requested: fault injection must
# be enabled (BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION == "1") and the marker file
# must exist.
# Globals: BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION, NETWORK_FAULT_FILE (read)
network_fault_injected() {
  # Treat an unset switch as "disabled" so that `set -u` cannot abort the
  # watchdog over a test-only setting.
  [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}" == "1" && -f "${NETWORK_FAULT_FILE}" ]]
}

# Resolve the 5G interface name and remember it for later log lines.
# Globals: NETWORK_LAST_INTERFACE (written; empty when resolution fails)
# Returns: 0 when a non-empty interface name was obtained, 1 otherwise
resolve_network_interface() {
  # A failing resolver is not fatal here; it simply leaves the name empty.
  NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)"
  [[ -n "${NETWORK_LAST_INTERFACE}" ]]
}

# Print, one per line, the addresses that need an explicit host route:
# BLITZ_TIME_SERVER_IP first (when set), followed by every entry of the
# comma- or whitespace-separated BLITZ_5G_ROUTE_TARGETS list that differs
# from it. Both variables are optional.
# Globals: BLITZ_TIME_SERVER_IP, BLITZ_5G_ROUTE_TARGETS (read)
# Outputs: target addresses on stdout
network_route_targets() {
  local target
  local -a targets=()

  if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then
    printf '%s\n' "${BLITZ_TIME_SERVER_IP}"
  fi

  # Split with `read -a` rather than an unquoted expansion: no pathname
  # expansion is applied to the entries, and an unset list is tolerated under
  # `set -u`. `read -d ''` reports non-zero at end of input, hence `|| true`.
  IFS=$', \t\n' read -r -d '' -a targets <<< "${BLITZ_5G_ROUTE_TARGETS:-}" || true

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older Bash releases.
  for target in ${targets[@]+"${targets[@]}"}; do
    if [[ -n "${target}" && "${target}" != "${BLITZ_TIME_SERVER_IP:-}" ]]; then
      printf '%s\n' "${target}"
    fi
  done
}

# Print the gateway to use for an interface.
# The address following `via` in the interface's first default route wins;
# when there is none, BLITZ_5G_GATEWAY is used if it is set and non-empty.
# Globals:   BLITZ_5G_GATEWAY (read, optional)
# Arguments: $1 - interface name
# Outputs:   the gateway address on stdout
# Returns:   0 when a gateway was printed, 1 when none could be determined
resolve_network_gateway() {
  local interface_name="$1"
  local default_route
  local gateway=""
  local tokens=()
  local index

  default_route="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)"
  if [[ -n "${default_route}" ]]; then
    read -r -a tokens <<< "${default_route}"
    # Stop one short of the end so tokens[index + 1] always exists.
    for (( index=0; index<${#tokens[@]}-1; index++ )); do
      if [[ "${tokens[index]}" == "via" ]]; then
        gateway="${tokens[index + 1]}"
        break
      fi
    done
  fi

  if [[ -n "${gateway}" ]]; then
    printf '%s\n' "${gateway}"
    return 0
  fi
  if [[ -n "${BLITZ_5G_GATEWAY:-}" ]]; then
    printf '%s\n' "${BLITZ_5G_GATEWAY}"
    return 0
  fi
  return 1
}

# Re-install the host routes for all route targets on the given interface.
# The repair is only attempted when BLITZ_TIME_SERVER_IP already answers a
# ping bound to that interface; otherwise the function fails without touching
# the routing table. After the routes are replaced, blitz_route_ready must
# produce output for the time server for the repair to count as successful.
# Globals:   STEP, BLITZ_TIME_SERVER_IP (read)
# Arguments: $1 - interface name (must be non-empty)
# Returns:   0 on success; the exit status of a failing `ip route replace`;
#            1 for every other failure
repair_network_routes() {
  local interface_name="$1"
  local gateway=""
  local target
  local route_output
  local rc

  if [[ -z "${interface_name}" ]]; then
    return 1
  fi
  if ! ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then
    blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1
    return 1
  fi

  gateway="$(resolve_network_gateway "${interface_name}" || true)"
  if [[ -z "${gateway}" ]]; then
    blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1
    return 1
  fi

  # Process substitution keeps the loop in the current shell, so `return`
  # below leaves the function rather than a subshell.
  while IFS= read -r target; do
    [[ -n "${target}" ]] || continue
    if ip route replace "${target}/32" via "${gateway}" dev "${interface_name}"; then
      blitz_log "${STEP}" "route-repair-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0
    else
      rc=$?
      blitz_log "${STEP}" "route-repair-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}"
      return "${rc}"
    fi
  done < <(network_route_targets)

  route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)"
  if [[ -z "${route_output}" ]]; then
    blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1
    return 1
  fi

  blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${route_output}" 0
  return 0
}

# Succeed when the network path to BLITZ_TIME_SERVER_IP is usable.
# Checks, in order: no injected fault, the interface resolves,
# blitz_route_ready produces output for the time server on that interface,
# and a single ping bound to the interface is answered within 2 seconds.
# Globals: BLITZ_TIME_SERVER_IP (read);
#          NETWORK_LAST_INTERFACE (reset to "" first, then set by
#          resolve_network_interface)
# Returns: 0 when every check passes, non-zero otherwise
network_is_healthy() {
  local route_output

  NETWORK_LAST_INTERFACE=""
  if network_fault_injected; then
    return 1
  fi
  if ! resolve_network_interface; then
    return 1
  fi
  route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
  if [[ -z "${route_output}" ]]; then
    return 1
  fi
  ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}

# Poll network_is_healthy once per second until it succeeds or the timeout
# elapses. A "waiting" log line is emitted on the first attempt and on every
# fifth second after that.
# Globals:   STEP, NETWORK_LAST_INTERFACE (read)
# Arguments: $1 - timeout in seconds; 0 means fail without probing
# Returns:   0 as soon as the network is healthy, 1 on timeout
wait_for_network_recovery() {
  local timeout_sec="$1"
  local waited=0

  while (( waited < timeout_sec )); do
    if network_is_healthy; then
      blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${waited}" 0
      return 0
    fi
    # waited == 0 is already covered by the modulo test.
    if (( waited % 5 == 0 )); then
      blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${waited}" 0
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done

  blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
  return 1
}

# Bring the network back, trying the cheapest remedy first.
#   1. Route repair: if the interface resolves and repair_network_routes
#      succeeds, no service is touched.
#   2. Redial: stop the B-side service, run 5g-dial.sh, wait for the network
#      to become healthy, then restart the B-side service alone when the ROS
#      receiver is healthy, or the full stack when it is not.
# A successful recovery starts the network cooldown and clears the failure
# counter; a failed redial or wait leaves both untouched.
# Globals: STEP, SCRIPT_DIR, B_SIDE_SERVICE, BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC,
#          BLITZ_5G_ROUTE_WAIT_SEC, BLITZ_HEALTH_STALE_SEC (read);
#          RECOVERY_ACTION_TAKEN, NETWORK_COOLDOWN_UNTIL, NETWORK_FAIL_COUNT
#          (written)
# Returns: 0 once the network is back (whatever the follow-up restart
#          reports); otherwise the exit status of the failing redial or wait
perform_network_recovery() {
  local rc=0

  if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
    set_last_action "route-repair"
    RECOVERY_ACTION_TAKEN=1
    NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
    NETWORK_FAIL_COUNT=0
    blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
    return 0
  fi

  set_last_action "network-recovery"
  RECOVERY_ACTION_TAKEN=1
  blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
  # Best effort: the redial proceeds even if the B-side service cannot be stopped.
  systemctl stop "${B_SIDE_SERVICE}" || true

  if bash "${SCRIPT_DIR}/5g-dial.sh"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${SCRIPT_DIR}/5g-dial.sh" "${rc}"
    return "${rc}"
  fi

  if wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}"; then
    :
  else
    rc=$?
    blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
    return "${rc}"
  fi

  NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
  NETWORK_FAIL_COUNT=0
  if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    restart_bside_targeted "network" "network-recovered"
    return 0
  fi
  full_restart_stack "network-recovered-ros-unhealthy"
  return 0
}

# ---------------------------------------------------------------------------
# Startup: load configuration, verify prerequisites, derive runtime paths.
# ---------------------------------------------------------------------------
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command stat "${STEP}"
blitz_require_command ping "${STEP}"
blitz_require_command python3 "${STEP}"
# NOTE(review): the route-repair helpers also invoke `ip`, which is not
# checked here — confirm whether it should be required as well.
blitz_prepare_runtime_dir

B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"

# ---------------------------------------------------------------------------
# Main loop: one health pass every BLITZ_WATCHDOG_INTERVAL_SEC seconds.
# At most one recovery action is taken per pass; once RECOVERY_ACTION_TAKEN
# is set, the remaining service checks are skipped until the next pass.
# ---------------------------------------------------------------------------
while true; do
  fault_reason="none"
  recovery_state="ok"
  network_ok=1
  camera_ok=1
  ros_ok=1
  bside_ok=1
  RECOVERY_ACTION_TAKEN=0
  now_sec="$(now_epoch_sec)"

  # Backoff after repeated full restarts: only publish status, do nothing else.
  if (( BACKOFF_UNTIL > now_sec )); then
    fault_reason="backoff"
    recovery_state="backoff"
    write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0
    sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
    continue
  fi

  # Network: skipped while the post-recovery cooldown is running; recovery is
  # attempted once the consecutive failure count reaches the threshold.
  if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
    recovery_state="recovering"
  elif ! network_is_healthy; then
    network_ok=0
    NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
    fault_reason="network_or_robot_unreachable"
    recovery_state="recovering"
    blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
    if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
      perform_network_recovery || true
    fi
  else
    NETWORK_FAIL_COUNT=0
  fi

  # Camera: after the device node reappears, wait for two consecutive passes
  # with it present before restarting the B-side service.
  if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
    camera_ok=0
    fault_reason="camera_missing"
    recovery_state="degraded"
    CAMERA_MISSING_PREV=1
    CAMERA_RECOVERY_STABLE_COUNT=0
  elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
    CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
    recovery_state="recovering"
    fault_reason="camera_recovered"
    if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
      restart_bside_targeted "camera" "camera-reappeared" || true
      CAMERA_MISSING_PREV=0
      CAMERA_RECOVERY_STABLE_COUNT=0
    fi
  else
    CAMERA_RECOVERY_STABLE_COUNT=0
  fi

  # B-side: unhealthy when the unit is not active or its status file is stale.
  if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
    bside_ok=0
    fault_reason="bside_status_stale"
    recovery_state="recovering"
    restart_bside_targeted "bside" "bside-unhealthy" || true
  fi

  # ROS receiver: any failed health signal triggers a full stack restart.
  if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
    ros_ok=0
    fault_reason="ros_receiver_unhealthy"
    recovery_state="recovering"
    full_restart_stack "ros-unhealthy" || true
  fi

  write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}"
  sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done