feat: 日志增强功能
This commit is contained in:
@@ -12,6 +12,10 @@ B_SIDE_STATUS_FILE=""
|
||||
ROS_STATUS_FILE=""
|
||||
WATCHDOG_STATUS_FILE=""
|
||||
NETWORK_FAULT_FILE=""
|
||||
WATCHDOG_EVENT_LOG=""
|
||||
WATCHDOG_SAMPLE_LOG=""
|
||||
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
|
||||
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
|
||||
CAMERA_MISSING_PREV=0
|
||||
CAMERA_RECOVERY_STABLE_COUNT=0
|
||||
NETWORK_FAIL_COUNT=0
|
||||
@@ -28,6 +32,8 @@ GPS_LAST_CHECK_SEC=0
|
||||
GPS_DEVICE_PRESENT_PREV=-1
|
||||
GPS_DEVICE_PRESENT_STATE=1
|
||||
GPS_STACK_ACTIVE_STATE=1
|
||||
LAST_REPORTED_FAULT_REASON=""
|
||||
LAST_REPORTED_RECOVERY_STATE=""
|
||||
declare -A TARGETED_RESTART_WINDOW_START=()
|
||||
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
||||
|
||||
@@ -236,6 +242,116 @@ EOF
|
||||
mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
|
||||
}
|
||||
|
||||
watchdog_emit_json() {
|
||||
local record_type="$1"
|
||||
local action="$2"
|
||||
local fault_reason="$3"
|
||||
local recovery_state="$4"
|
||||
local detail="$5"
|
||||
local incident_id="${6:-}"
|
||||
local network_ok="${7:-1}"
|
||||
local camera_ok="${8:-1}"
|
||||
local ros_ok="${9:-1}"
|
||||
local bside_ok="${10:-1}"
|
||||
local gps_ok="${11:-1}"
|
||||
local gps_device_present="${12:-1}"
|
||||
|
||||
python3 - "${record_type}" "${action}" "${fault_reason}" "${recovery_state}" "${detail}" "${incident_id}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}" "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
|
||||
record_type, action, fault_reason, recovery_state, detail, incident_id, network_ok, camera_ok, ros_ok, bside_ok, gps_ok, gps_device_present, last_action, last_action_epoch_ms, network_fail_count, targeted_restart_count, full_restart_count = sys.argv[1:18]
|
||||
payload = {
|
||||
"ts_unix_ms": time.time_ns() // 1_000_000,
|
||||
"record_type": record_type,
|
||||
"action": action,
|
||||
"fault_reason": fault_reason,
|
||||
"recovery_state": recovery_state,
|
||||
"detail": detail,
|
||||
"incident_id": incident_id or None,
|
||||
"network_ok": network_ok == "1",
|
||||
"camera_ok": camera_ok == "1",
|
||||
"ros_ok": ros_ok == "1",
|
||||
"bside_ok": bside_ok == "1",
|
||||
"gps_ok": gps_ok == "1",
|
||||
"gps_device_present": gps_device_present == "1",
|
||||
"network_fail_count": int(network_fail_count),
|
||||
"targeted_restart_count": int(targeted_restart_count),
|
||||
"full_restart_count": int(full_restart_count),
|
||||
"last_action": last_action,
|
||||
"last_action_epoch_ms": int(last_action_epoch_ms or 0),
|
||||
}
|
||||
print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False))
|
||||
PY
|
||||
}
|
||||
|
||||
watchdog_append_event() {
|
||||
local line=""
|
||||
|
||||
[[ -n "${WATCHDOG_EVENT_LOG}" ]] || return 0
|
||||
if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
|
||||
if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
|
||||
blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${line}" 0 || true
|
||||
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
if ! blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${line}"; then
|
||||
if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
|
||||
blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=append-failed" 0 || true
|
||||
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
|
||||
}
|
||||
|
||||
watchdog_append_sample() {
|
||||
local line=""
|
||||
|
||||
[[ -n "${WATCHDOG_SAMPLE_LOG}" ]] || return 0
|
||||
if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
|
||||
if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
|
||||
blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${line}" 0 || true
|
||||
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
if ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${line}"; then
|
||||
if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
|
||||
blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=append-failed" 0 || true
|
||||
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
|
||||
}
|
||||
|
||||
watchdog_record_state_transition() {
|
||||
local fault_reason="$1"
|
||||
local recovery_state="$2"
|
||||
|
||||
if [[ "${fault_reason}" == "${LAST_REPORTED_FAULT_REASON}" && "${recovery_state}" == "${LAST_REPORTED_RECOVERY_STATE}" ]]; then
|
||||
return 0
|
||||
fi
|
||||
watchdog_append_event "event" "state-transition" "${fault_reason}" "${recovery_state}" "" ""
|
||||
LAST_REPORTED_FAULT_REASON="${fault_reason}"
|
||||
LAST_REPORTED_RECOVERY_STATE="${recovery_state}"
|
||||
}
|
||||
|
||||
watchdog_launch_incident() {
|
||||
local reason="$1"
|
||||
local unit_name="$2"
|
||||
|
||||
"${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" \
|
||||
--source watchdog \
|
||||
--reason "${reason}" \
|
||||
--unit "${unit_name}" \
|
||||
--result failure \
|
||||
--exit-status 1 2>/dev/null || true
|
||||
}
|
||||
|
||||
set_last_action() {
|
||||
LAST_ACTION="$1"
|
||||
LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
|
||||
@@ -283,6 +399,7 @@ record_full_restart() {
|
||||
fi
|
||||
if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
|
||||
BACKOFF_UNTIL=$(( now_sec + 60 ))
|
||||
watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" ""
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -290,36 +407,45 @@ restart_bside_targeted() {
|
||||
local fault_key="$1"
|
||||
local reason="$2"
|
||||
local rc
|
||||
local incident_id=""
|
||||
|
||||
if register_targeted_restart "${fault_key}"; then
|
||||
blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
|
||||
watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" ""
|
||||
full_restart_stack "${reason}-escalated"
|
||||
return 0
|
||||
fi
|
||||
|
||||
incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")"
|
||||
set_last_action "restart-bside"
|
||||
RECOVERY_ACTION_TAKEN=1
|
||||
blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
|
||||
watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
|
||||
if systemctl restart "${B_SIDE_SERVICE}"; then
|
||||
blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
|
||||
watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
rc=$?
|
||||
blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
|
||||
watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}"
|
||||
return "${rc}"
|
||||
}
|
||||
|
||||
full_restart_stack() {
|
||||
local reason="$1"
|
||||
local rc
|
||||
local incident_id=""
|
||||
|
||||
incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")"
|
||||
set_last_action "full-restart"
|
||||
RECOVERY_ACTION_TAKEN=1
|
||||
recovery_state="recovering"
|
||||
fault_reason="${reason}"
|
||||
|
||||
blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
|
||||
watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}"
|
||||
systemctl stop "${B_SIDE_SERVICE}" || true
|
||||
|
||||
if systemctl restart "${ROS_SERVICE}"; then
|
||||
@@ -345,9 +471,11 @@ full_restart_stack() {
|
||||
else
|
||||
rc=$?
|
||||
blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
|
||||
watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}"
|
||||
record_full_restart
|
||||
return "${rc}"
|
||||
fi
|
||||
watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}"
|
||||
record_full_restart
|
||||
}
|
||||
|
||||
@@ -659,6 +787,7 @@ wait_for_network_recovery() {
|
||||
|
||||
perform_network_recovery() {
|
||||
local rc=0
|
||||
local incident_id=""
|
||||
|
||||
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
|
||||
set_last_action "route-repair"
|
||||
@@ -666,12 +795,15 @@ perform_network_recovery() {
|
||||
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
||||
NETWORK_FAIL_COUNT=0
|
||||
blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
|
||||
watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" ""
|
||||
return 0
|
||||
fi
|
||||
|
||||
incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")"
|
||||
set_last_action "network-recovery"
|
||||
RECOVERY_ACTION_TAKEN=1
|
||||
blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
|
||||
watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}"
|
||||
systemctl stop "${B_SIDE_SERVICE}" || true
|
||||
|
||||
if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then
|
||||
@@ -679,6 +811,7 @@ perform_network_recovery() {
|
||||
else
|
||||
rc=$?
|
||||
blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
|
||||
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}"
|
||||
return "${rc}"
|
||||
fi
|
||||
|
||||
@@ -687,11 +820,13 @@ perform_network_recovery() {
|
||||
else
|
||||
rc=$?
|
||||
blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
|
||||
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}"
|
||||
return "${rc}"
|
||||
fi
|
||||
|
||||
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
||||
NETWORK_FAIL_COUNT=0
|
||||
watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}"
|
||||
if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
|
||||
restart_bside_targeted "network" "network-recovered"
|
||||
return 0
|
||||
@@ -707,11 +842,14 @@ blitz_require_command stat "${STEP}"
|
||||
blitz_require_command ping "${STEP}"
|
||||
blitz_require_command python3 "${STEP}"
|
||||
blitz_prepare_runtime_dir
|
||||
blitz_require_run_context
|
||||
|
||||
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
|
||||
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
|
||||
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
|
||||
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
|
||||
WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl"
|
||||
WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl"
|
||||
|
||||
while true; do
|
||||
fault_reason="none"
|
||||
@@ -735,7 +873,9 @@ while true; do
|
||||
if (( BACKOFF_UNTIL > now_sec )); then
|
||||
fault_reason="backoff"
|
||||
recovery_state="backoff"
|
||||
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
|
||||
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
||||
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||
continue
|
||||
fi
|
||||
@@ -824,6 +964,8 @@ while true; do
|
||||
full_restart_stack "ros-unhealthy" || true
|
||||
fi
|
||||
|
||||
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
|
||||
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
||||
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user