feat:新增gps监控服务(断开重连)
This commit is contained in:
@@ -77,6 +77,10 @@ BLITZ_HEALTH_STALE_SEC="15"
|
|||||||
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
||||||
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
||||||
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
||||||
|
BLITZ_GPS_MONITOR_ENABLED="1"
|
||||||
|
BLITZ_GPS_DEVICE_GLOB="/dev/ttyCH341USB*"
|
||||||
|
BLITZ_GPS_CHECK_INTERVAL_SEC="10"
|
||||||
|
BLITZ_GPS_RESTART_UNITS="gpsd.socket gpsd.service"
|
||||||
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
|
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -157,6 +161,8 @@ Key files:
|
|||||||
- `ros-receiver.status.json`
|
- `ros-receiver.status.json`
|
||||||
- `watchdog.status.json`
|
- `watchdog.status.json`
|
||||||
|
|
||||||
|
`watchdog.status.json` now also records `gps_ok` and `gps_device_present` so you can quickly tell whether the GPS USB serial node is currently visible and whether the last `gpsd` reconnect attempt succeeded.
|
||||||
|
|
||||||
Pretty-print them:
|
Pretty-print them:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -201,6 +207,7 @@ sudo bash scripts/boot/blitz-fault-inject.sh network-down off
|
|||||||
- wait for unix socket
|
- wait for unix socket
|
||||||
- start `b_side`
|
- start `b_side`
|
||||||
- If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services.
|
- If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services.
|
||||||
|
- If GPS monitoring is enabled, watchdog checks `BLITZ_GPS_DEVICE_GLOB` every `BLITZ_GPS_CHECK_INTERVAL_SEC` seconds. When the GPS serial device disappears and later reappears, watchdog restarts the units in `BLITZ_GPS_RESTART_UNITS` so `gpsd` can bind to the new device node again. The same restart is also triggered when the device is present but none of the units in `BLITZ_GPS_RESTART_UNITS` is active.
|
||||||
- Camera disappearance is logged as degraded state. Reappearance triggers a `b_side` restart after the device is stable.
|
- Camera disappearance is logged as degraded state. Reappearance triggers a `b_side` restart after the device is stable.
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|||||||
@@ -22,6 +22,10 @@ LAST_ACTION_EPOCH_MS=0
|
|||||||
FULL_RESTART_WINDOW_START=0
|
FULL_RESTART_WINDOW_START=0
|
||||||
FULL_RESTART_WINDOW_COUNT=0
|
FULL_RESTART_WINDOW_COUNT=0
|
||||||
NETWORK_LAST_INTERFACE=""
|
NETWORK_LAST_INTERFACE=""
|
||||||
|
GPS_LAST_CHECK_SEC=0
|
||||||
|
GPS_DEVICE_PRESENT_PREV=-1
|
||||||
|
GPS_DEVICE_PRESENT_STATE=1
|
||||||
|
GPS_STACK_ACTIVE_STATE=1
|
||||||
declare -A TARGETED_RESTART_WINDOW_START=()
|
declare -A TARGETED_RESTART_WINDOW_START=()
|
||||||
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
||||||
|
|
||||||
@@ -37,6 +41,113 @@ service_is_active() {
|
|||||||
systemctl is-active --quiet "$1"
|
systemctl is-active --quiet "$1"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Report whether GPS monitoring is switched on.
# Globals:
#   BLITZ_GPS_MONITOR_ENABLED (read) - treated as "0" when unset or empty
# Returns:
#   0 when the flag is exactly "1", 1 for any other value.
gps_monitor_enabled() {
  local flag="${BLITZ_GPS_MONITOR_ENABLED:-0}"

  case "${flag}" in
    1) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||||
|
|
||||||
|
# Report whether at least one of the configured GPS units is active.
# Globals:
#   BLITZ_GPS_RESTART_UNITS (read) - whitespace-separated unit names
# Returns:
#   0 as soon as service_is_active succeeds for one unit;
#   1 when the list is empty or no unit is active.
gps_stack_active() {
  local -a unit_names=()
  local idx

  # Split the configured string into individual unit names.
  read -r -a unit_names <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  (( ${#unit_names[@]} > 0 )) || return 1

  for (( idx = 0; idx < ${#unit_names[@]}; idx++ )); do
    if service_is_active "${unit_names[idx]}"; then
      return 0
    fi
  done

  return 1
}
|
||||||
|
|
||||||
|
# Restart the systemd units listed in BLITZ_GPS_RESTART_UNITS and record the
# outcome.
# Globals:
#   BLITZ_GPS_RESTART_UNITS (read)    - whitespace-separated unit names
#   STEP (read)                       - passed through to blitz_log
#   GPS_STACK_ACTIVE_STATE (written)  - 1 after a successful restart, else 0
# Arguments:
#   $1 - reason tag included in the log details
#   $2 - device summary included in the log details
# Returns:
#   0 when `systemctl restart` succeeds, 1 when no units are configured,
#   otherwise the exit status of `systemctl restart`.
restart_gps_stack() {
  local reason="$1"
  local devices="$2"
  local units=()
  local rc=0

  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    GPS_STACK_ACTIVE_STATE=0
    blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1
    return 1
  fi

  set_last_action "gps-reconnect"
  blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0

  # Capture the status on the command itself. Reading $? after an
  # `if cmd; then ...; fi` whose condition failed always yields 0, which
  # would make a failed restart look like a success to the caller.
  systemctl restart "${units[@]}" || rc=$?

  if (( rc == 0 )); then
    GPS_STACK_ACTIVE_STATE=1
    blitz_log "${STEP}" "gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
    return 0
  fi

  GPS_STACK_ACTIVE_STATE=0
  blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}"
  return "${rc}"
}
|
||||||
|
|
||||||
|
# Check whether the GPS serial device is visible and the GPS units are
# running, restarting the units when recovery is needed.
# Globals:
#   BLITZ_GPS_CHECK_INTERVAL_SEC (read)  - minimum seconds between real checks
#   BLITZ_GPS_DEVICE_GLOB (read)         - glob matched against device nodes
#   STEP (read)                          - passed through to blitz_log
#   GPS_LAST_CHECK_SEC (read/written)    - time of the last real check
#   GPS_DEVICE_PRESENT_PREV (read/written) - presence seen by the previous
#                                            real check (0 or 1; other values
#                                            mean "no previous observation")
#   GPS_DEVICE_PRESENT_STATE (written)   - 1 if a device matched, else 0
#   GPS_STACK_ACTIVE_STATE (written)     - 1 if the GPS units are considered
#                                          healthy, else 0
# Arguments:
#   $1 - current time in epoch seconds
# Returns:
#   0 when monitoring is disabled or GPS is considered healthy, 1 otherwise.
check_gps_health() {
  local now_sec="$1"
  local check_interval_sec="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
  local device_glob="${BLITZ_GPS_DEVICE_GLOB:-}"
  local previous_present="${GPS_DEVICE_PRESENT_PREV}"
  local recovery_reason=""
  local device_summary=""
  local -a devices=()

  if ! gps_monitor_enabled; then
    # Monitoring is off: report healthy so GPS never marks the stack degraded.
    GPS_DEVICE_PRESENT_STATE=1
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  # Sanitize the interval before it reaches arithmetic context. A value such
  # as "08" or "10s" would otherwise raise a Bash arithmetic error on every
  # call and silently disable the throttle below. Anything that is not a
  # plain non-negative integer falls back to the default of 10 seconds.
  if [[ "${check_interval_sec}" =~ ^[0-9]+$ ]]; then
    check_interval_sec=$(( 10#${check_interval_sec} ))
  else
    check_interval_sec=10
  fi
  if (( check_interval_sec < 1 )); then
    check_interval_sec=1
  fi

  # Throttle: between real checks, answer from the cached state.
  if (( GPS_LAST_CHECK_SEC != 0 && now_sec - GPS_LAST_CHECK_SEC < check_interval_sec )); then
    if (( GPS_DEVICE_PRESENT_STATE == 1 && GPS_STACK_ACTIVE_STATE == 1 )); then
      return 0
    fi
    return 1
  fi
  GPS_LAST_CHECK_SEC="${now_sec}"

  # compgen -G prints one matching path per line and fails when nothing
  # matches; `|| true` keeps that failure from propagating.
  mapfile -t devices < <(compgen -G "${device_glob}" || true)
  if (( ${#devices[@]} == 0 )); then
    GPS_DEVICE_PRESENT_STATE=0
    GPS_STACK_ACTIVE_STATE=0
    # Log only on the transition to "missing", not on every check.
    if (( previous_present != 0 )); then
      blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${device_glob}" 1
    fi
    GPS_DEVICE_PRESENT_PREV=0
    return 1
  fi

  # Comma-join the matched paths for logging.
  device_summary="$(IFS=,; printf '%s' "${devices[*]}")"
  GPS_DEVICE_PRESENT_STATE=1
  GPS_DEVICE_PRESENT_PREV=1

  if (( previous_present == 0 )); then
    blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${device_summary}" 0
    recovery_reason="device-reappeared"
  elif ! gps_stack_active; then
    recovery_reason="gpsd-inactive"
  fi

  if [[ -n "${recovery_reason}" ]]; then
    # NOTE(review): GPS_DEVICE_PRESENT_PREV is already 1 here, so after a
    # failed restart the next check retries only if gps_stack_active reports
    # the units inactive - confirm that this is the intended retry policy.
    if restart_gps_stack "${recovery_reason}" "${device_summary}"; then
      return 0
    fi
    return 1
  fi

  GPS_STACK_ACTIVE_STATE=1
  return 0
}
|
||||||
|
|
||||||
status_file_fresh() {
|
status_file_fresh() {
|
||||||
local path="$1"
|
local path="$1"
|
||||||
local max_age_sec="$2"
|
local max_age_sec="$2"
|
||||||
@@ -97,6 +208,8 @@ write_watchdog_status() {
|
|||||||
local camera_ok="$4"
|
local camera_ok="$4"
|
||||||
local ros_ok="$5"
|
local ros_ok="$5"
|
||||||
local bside_ok="$6"
|
local bside_ok="$6"
|
||||||
|
local gps_ok="$7"
|
||||||
|
local gps_device_present="$8"
|
||||||
local tmp_file
|
local tmp_file
|
||||||
|
|
||||||
tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
|
tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
|
||||||
@@ -109,6 +222,8 @@ write_watchdog_status() {
|
|||||||
"camera_ok": ${camera_ok},
|
"camera_ok": ${camera_ok},
|
||||||
"ros_ok": ${ros_ok},
|
"ros_ok": ${ros_ok},
|
||||||
"bside_ok": ${bside_ok},
|
"bside_ok": ${bside_ok},
|
||||||
|
"gps_ok": ${gps_ok},
|
||||||
|
"gps_device_present": ${gps_device_present},
|
||||||
"network_fail_count": ${NETWORK_FAIL_COUNT},
|
"network_fail_count": ${NETWORK_FAIL_COUNT},
|
||||||
"targeted_restart_count": $(targeted_restart_total),
|
"targeted_restart_count": $(targeted_restart_total),
|
||||||
"full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
|
"full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
|
||||||
@@ -427,13 +542,22 @@ while true; do
|
|||||||
camera_ok=1
|
camera_ok=1
|
||||||
ros_ok=1
|
ros_ok=1
|
||||||
bside_ok=1
|
bside_ok=1
|
||||||
|
gps_ok=1
|
||||||
|
gps_device_present=1
|
||||||
RECOVERY_ACTION_TAKEN=0
|
RECOVERY_ACTION_TAKEN=0
|
||||||
now_sec="$(now_epoch_sec)"
|
now_sec="$(now_epoch_sec)"
|
||||||
|
|
||||||
|
if gps_monitor_enabled; then
|
||||||
|
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
|
||||||
|
if (( GPS_DEVICE_PRESENT_STATE == 0 || GPS_STACK_ACTIVE_STATE == 0 )); then
|
||||||
|
gps_ok=0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if (( BACKOFF_UNTIL > now_sec )); then
|
if (( BACKOFF_UNTIL > now_sec )); then
|
||||||
fault_reason="backoff"
|
fault_reason="backoff"
|
||||||
recovery_state="backoff"
|
recovery_state="backoff"
|
||||||
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0
|
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
||||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
@@ -453,6 +577,22 @@ while true; do
|
|||||||
NETWORK_FAIL_COUNT=0
|
NETWORK_FAIL_COUNT=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if check_gps_health "${now_sec}"; then
|
||||||
|
gps_ok=1
|
||||||
|
else
|
||||||
|
gps_ok=0
|
||||||
|
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
|
||||||
|
if [[ "${fault_reason}" == "none" ]]; then
|
||||||
|
if (( GPS_DEVICE_PRESENT_STATE == 0 )); then
|
||||||
|
fault_reason="gps_device_missing"
|
||||||
|
else
|
||||||
|
fault_reason="gps_reconnect_failed"
|
||||||
|
fi
|
||||||
|
recovery_state="degraded"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
|
||||||
|
|
||||||
if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
|
if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
|
||||||
camera_ok=0
|
camera_ok=0
|
||||||
fault_reason="camera_missing"
|
fault_reason="camera_missing"
|
||||||
@@ -486,6 +626,6 @@ while true; do
|
|||||||
full_restart_stack "ros-unhealthy" || true
|
full_restart_stack "ros-unhealthy" || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}"
|
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
||||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -73,6 +73,10 @@ blitz_load_boot_env() {
|
|||||||
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
|
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
|
||||||
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
|
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
|
||||||
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
|
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
|
||||||
|
export BLITZ_GPS_MONITOR_ENABLED="${BLITZ_GPS_MONITOR_ENABLED:-1}"
|
||||||
|
export BLITZ_GPS_DEVICE_GLOB="${BLITZ_GPS_DEVICE_GLOB:-/dev/ttyCH341USB*}"
|
||||||
|
export BLITZ_GPS_CHECK_INTERVAL_SEC="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
|
||||||
|
export BLITZ_GPS_RESTART_UNITS="${BLITZ_GPS_RESTART_UNITS:-gpsd.socket gpsd.service}"
|
||||||
export BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}"
|
export BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}"
|
||||||
export BLITZ_BOOT_ENV_LOADED="1"
|
export BLITZ_BOOT_ENV_LOADED="1"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"interface": "enx08711b726c22",
|
"interface": "enxd41a57335f9d",
|
||||||
"ipv4": [
|
"ipv4": [
|
||||||
"192.168.225.66/22"
|
"192.168.225.83/22"
|
||||||
],
|
],
|
||||||
"ipv6": [
|
"ipv6": [
|
||||||
"fe80::86e0:4771:425d:8b20/64"
|
"fe80::18c1:e89d:e033:9857/64"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -27,6 +27,10 @@ BLITZ_HEALTH_STALE_SEC="15"
|
|||||||
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
||||||
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
||||||
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
||||||
|
BLITZ_GPS_MONITOR_ENABLED="1"
|
||||||
|
BLITZ_GPS_DEVICE_GLOB="/dev/ttyCH341USB*"
|
||||||
|
BLITZ_GPS_CHECK_INTERVAL_SEC="10"
|
||||||
|
BLITZ_GPS_RESTART_UNITS="gpsd.socket gpsd.service"
|
||||||
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
|
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
|
||||||
|
|
||||||
# Boot units run b_side_omnid as root directly, so nested sudo must stay off.
|
# Boot units run b_side_omnid as root directly, so nested sudo must stay off.
|
||||||
|
|||||||
Reference in New Issue
Block a user