diff --git a/scripts/boot/README.md b/scripts/boot/README.md index a10a5bb..ab75d33 100644 --- a/scripts/boot/README.md +++ b/scripts/boot/README.md @@ -207,6 +207,8 @@ sudo bash scripts/boot/blitz-fault-inject.sh network-down off - wait for unix socket - start `b_side` - If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services. +- While 5G is healthy, watchdog keeps every host route listed by `BLITZ_TIME_SERVER_IP` and `BLITZ_5G_ROUTE_TARGETS` pinned to the resolved 5G interface. When 5G becomes unhealthy, watchdog deletes those host routes so traffic can fall back to the remaining default network path. If that fallback path is still reachable, watchdog keeps `b_side_omnid` running instead of treating it as a full network outage. +- Whenever watchdog changes or restores those host routes, it logs `route-path` lines for each target so you can see which interface Linux currently chooses for `81.70.156.140`, `106.55.173.235`, and any other configured 5G-pinned target. - If GPS monitoring is enabled, watchdog checks `BLITZ_GPS_DEVICE_GLOB` every `BLITZ_GPS_CHECK_INTERVAL_SEC` seconds. When the GPS serial device disappears and later reappears, watchdog restarts the units in `BLITZ_GPS_RESTART_UNITS` so `gpsd` can bind to the new device node again. - Camera disappearance is logged as degraded state. Reappearance triggers a `b_side` restart after the device is stable. diff --git a/scripts/boot/blitz-watchdog.sh b/scripts/boot/blitz-watchdog.sh index 2ebd9b1..117f026 100644 --- a/scripts/boot/blitz-watchdog.sh +++ b/scripts/boot/blitz-watchdog.sh @@ -22,6 +22,8 @@ LAST_ACTION_EPOCH_MS=0 FULL_RESTART_WINDOW_START=0 FULL_RESTART_WINDOW_COUNT=0 NETWORK_LAST_INTERFACE="" +NETWORK_ROUTE_INTERFACE_LAST_KNOWN="" +NETWORK_PRIMARY_LAST_RETRY_SEC=0 GPS_LAST_CHECK_SEC=0 GPS_DEVICE_PRESENT_PREV=-1 GPS_DEVICE_PRESENT_STATE=1 @@ -329,7 +331,7 @@ full_restart_stack() { return "${rc}" fi - if bash "${SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then + if bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then : else rc=$? @@ -355,7 +357,11 @@ network_fault_injected() { resolve_network_interface() { NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)" - [[ -n "${NETWORK_LAST_INTERFACE}" ]] + if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then + NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}" + return 0 + fi + return 1 } network_route_targets() { @@ -371,6 +377,84 @@ network_route_targets() { done } +log_target_route_paths() { + local action="$1" + local target + local route_output + + while IFS= read -r target; do + [[ -n "${target}" ]] || continue + route_output="$(ip route get "${target}" 2>&1 | head -n 1 || true)" + if [[ -z "${route_output}" ]]; then + route_output="unresolved" + fi + blitz_log "${STEP}" "route-path" "info" "action=${action} target=${target} route=${route_output}" 0 + done < <(network_route_targets) +} + +route_output_uses_interface() { + local route_output="$1" + local interface_name="$2" + + [[ -n "${interface_name}" ]] || return 1 + [[ "${route_output}" == *" dev ${interface_name} "* || "${route_output}" == *" dev ${interface_name}" ]] +} + +route_output_uses_gateway() { + local route_output="$1" + local gateway="$2" + + [[ -n "${gateway}" ]] || return 1 + [[ "${route_output}" == *"via ${gateway}"* ]] +} + +route_is_desired_target_route() { + local route_output="$1" + local interface_name="$2" + local gateway="$3" + + route_output_uses_interface "${route_output}" "${interface_name}" \ + && route_output_uses_gateway "${route_output}" "${gateway}" +} + +route_is_managed_5g_route() { + local route_output="$1" + local interface_name="${2:-}" + local gateway="${3:-}" + + if route_output_uses_interface "${route_output}" "${interface_name}"; then + return 0 + fi + if route_output_uses_gateway "${route_output}" "${gateway}"; then + return 0 + fi + if route_output_uses_gateway "${route_output}" "${BLITZ_5G_GATEWAY:-}"; then + return 0 + fi + return 1 +} + +resolve_route_cleanup_interface() { + local interface_name="" + local info_json="${BLITZ_5G_INFO_JSON:-}" + + if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then + printf '%s\n' "${NETWORK_LAST_INTERFACE}" + return 0 + fi + if [[ -n "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" ]]; then + printf '%s\n' "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" + return 0 + fi + + interface_name="$(blitz_read_5g_info_interface "${info_json}" || true)" + if [[ -n "${interface_name}" ]]; then + printf '%s\n' "${interface_name}" + return 0 + fi + return 1 +} + resolve_network_gateway() { local interface_name="$1" local default_route @@ -400,18 +484,96 @@ resolve_network_gateway() { return 1 } -repair_network_routes() { +sync_target_routes_to_5g() { local interface_name="$1" - local gateway="" + local gateway="${2:-}" + local route_output="" + local updated=0 local target - local route_output local rc if [[ -z "${interface_name}" ]]; then return 1 fi - if ! ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then - blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1 + + if [[ -z "${gateway}" ]]; then + gateway="$(resolve_network_gateway "${interface_name}" || true)" + fi + if [[ -z "${gateway}" ]]; then + blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1 + return 1 + fi + + while IFS= read -r target; do + [[ -n "${target}" ]] || continue + route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)" + if [[ -n "${route_output}" ]] && route_is_desired_target_route "${route_output}" "${interface_name}" "${gateway}"; then + continue + fi + if ip route replace "${target}/32" via "${gateway}" dev "${interface_name}"; then + updated=1 + blitz_log "${STEP}" "route-sync-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0 + else + rc=$? + blitz_log "${STEP}" "route-sync-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}" + return "${rc}" + fi + done < <(network_route_targets) + + if (( updated == 1 )); then + NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}" + log_target_route_paths "sync-to-5g" + fi + return 0 +} + +clear_target_routes_from_5g() { + local interface_name="${1:-}" + local gateway="${2:-}" + local route_output="" + local target + local removed_any=0 + local rc + + if [[ -z "${interface_name}" ]]; then + interface_name="$(resolve_route_cleanup_interface || true)" + fi + if [[ -z "${gateway}" && -n "${interface_name}" ]]; then + gateway="$(resolve_network_gateway "${interface_name}" || true)" + fi + if [[ -z "${gateway}" ]]; then + gateway="${BLITZ_5G_GATEWAY:-}" + fi + + while IFS= read -r target; do + [[ -n "${target}" ]] || continue + route_output="$(ip route show "${target}/32" 2>/dev/null | head -n 1 || true)" + if [[ -z "${route_output}" ]] || ! route_is_managed_5g_route "${route_output}" "${interface_name}" "${gateway}"; then + continue + fi + if ip route del "${target}/32"; then + removed_any=1 + blitz_log "${STEP}" "route-clear-target" "success" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0 + else + rc=$? + blitz_log "${STEP}" "route-clear-target" "failure" "target=${target} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" "${rc}" + return "${rc}" + fi + done < <(network_route_targets) + + if (( removed_any == 1 )); then + blitz_log "${STEP}" "route-clear" "success" "interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0 + log_target_route_paths "clear-from-5g" + fi + return 0 +} + +repair_network_routes() { + local interface_name="$1" + local gateway="" + local route_output + + if [[ -z "${interface_name}" ]]; then return 1 fi @@ -421,23 +583,24 @@ repair_network_routes() { return 1 fi - while IFS= read -r target; do - [[ -n "${target}" ]] || continue - if ip route replace "${target}/32" via "${gateway}" dev "${interface_name}"; then - blitz_log "${STEP}" "route-repair-target" "success" "target=${target} interface=${interface_name} gateway=${gateway}" 0 - else - rc=$? - blitz_log "${STEP}" "route-repair-target" "failure" "target=${target} interface=${interface_name} gateway=${gateway}" "${rc}" - return "${rc}" - fi - done < <(network_route_targets) + if ! sync_target_routes_to_5g "${interface_name}" "${gateway}"; then + clear_target_routes_from_5g "${interface_name}" "${gateway}" || true + return 1 + fi route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)" if [[ -z "${route_output}" ]]; then + clear_target_routes_from_5g "${interface_name}" "${gateway}" || true blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1 return 1 fi + if ! ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1; then + clear_target_routes_from_5g "${interface_name}" "${gateway}" || true + blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1 + return 1 + fi + blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${route_output}" 0 return 0 } @@ -459,6 +622,21 @@ network_is_healthy() { ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 } +fallback_network_is_healthy() { + local route_output + + if [[ -z "${BLITZ_TIME_SERVER_IP:-}" ]]; then + return 1 + fi + + route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" || true)" + if [[ -z "${route_output}" ]]; then + return 1 + fi + + ping -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 +} + wait_for_network_recovery() { local timeout_sec="$1" local waited=0 @@ -496,11 +674,11 @@ perform_network_recovery() { blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0 systemctl stop "${B_SIDE_SERVICE}" || true - if bash "${SCRIPT_DIR}/5g-dial.sh"; then + if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then : else rc=$? - blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${SCRIPT_DIR}/5g-dial.sh" "${rc}" + blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}" return "${rc}" fi @@ -565,16 +743,36 @@ while true; do if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then recovery_state="recovering" elif ! network_is_healthy; then - network_ok=0 - NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 )) - fault_reason="network_or_robot_unreachable" - recovery_state="recovering" - blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1 - if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then - perform_network_recovery || true + clear_target_routes_from_5g || true + if fallback_network_is_healthy; then + NETWORK_FAIL_COUNT=0 + fault_reason="network_fallback_active" + recovery_state="degraded" + blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0 + if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then + NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}" + if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then + NETWORK_PRIMARY_LAST_RETRY_SEC=0 + fault_reason="none" + recovery_state="ok" + blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0 + log_target_route_paths "primary-restored" + fi + fi + else + network_ok=0 + NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 )) + fault_reason="network_or_robot_unreachable" + recovery_state="recovering" + blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1 + if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then + perform_network_recovery || true + fi fi else + NETWORK_PRIMARY_LAST_RETRY_SEC=0 NETWORK_FAIL_COUNT=0 + sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true fi if check_gps_health "${now_sec}"; then diff --git a/scripts/boot/modem_network_info.json b/scripts/boot/modem_network_info.json index e7b4a1e..35f08f5 100644 --- a/scripts/boot/modem_network_info.json +++ b/scripts/boot/modem_network_info.json @@ -1,9 +1,9 @@ { - "interface": "enxd41a57335f9d", + "interface": "enxb8f72c9e179a", "ipv4": [ - "192.168.225.83/22" + "192.168.225.160/22" ], "ipv6": [ - "fe80::18c1:e89d:e033:9857/64" + "fe80::52ae:a1c8:a9bb:a9a8/64" ] } \ No newline at end of file diff --git a/scripts/dev/load-env.sh b/scripts/dev/load-env.sh index 1c531ff..306fdc8 100644 --- a/scripts/dev/load-env.sh +++ b/scripts/dev/load-env.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -DEFAULT_OMNISOCKETGO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +LOAD_ENV_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEFAULT_OMNISOCKETGO_ROOT="$(cd "${LOAD_ENV_SCRIPT_DIR}/../.." && pwd)" die() { echo "$*" >&2 @@ -21,7 +21,7 @@ is_robot_command_center_root() { require_robot_command_center_root() { if ! is_robot_command_center_root "${ROBOT_COMMAND_CENTER_ROOT}"; then - die "ROBOT_COMMAND_CENTER_ROOT must point to the robot-command-center repo root. Current value: ${ROBOT_COMMAND_CENTER_ROOT}. Set it in ${SCRIPT_DIR}/robot-remote.env.local if needed." + die "ROBOT_COMMAND_CENTER_ROOT must point to the robot-command-center repo root. Current value: ${ROBOT_COMMAND_CENTER_ROOT}. Set it in ${LOAD_ENV_SCRIPT_DIR}/robot-remote.env.local if needed." fi } @@ -55,8 +55,8 @@ if [[ "${OMNI_CAMERA_VERIFY+x}" == "x" ]]; then fi ENV_FILES=( - "${SCRIPT_DIR}/robot-remote.env" - "${SCRIPT_DIR}/robot-remote.env.local" + "${LOAD_ENV_SCRIPT_DIR}/robot-remote.env" + "${LOAD_ENV_SCRIPT_DIR}/robot-remote.env.local" ) for env_file in "${ENV_FILES[@]}"; do