From e895cdc9def427aa69a50bff3a7dc67295d5e730 Mon Sep 17 00:00:00 2001 From: Mock Date: Tue, 14 Apr 2026 20:52:41 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=97=A5=E5=BF=97=E5=A2=9E=E5=BC=BA?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/b_side_omnid.c | 81 ++++++- include/omni_common.h | 11 + include/video_pipeline.h | 2 + scripts/boot/blitz-5g-link-logger.sh | 137 +++++++++++ scripts/boot/blitz-incident-capture-launch.sh | 50 ++++ scripts/boot/blitz-incident-capture.sh | 131 ++++++++++ scripts/boot/blitz-run-context.sh | 14 ++ scripts/boot/blitz-watchdog.sh | 142 +++++++++++ scripts/boot/common.sh | 225 +++++++++++++++++ scripts/boot/disable-systemd.sh | 2 + scripts/boot/install-systemd.sh | 3 + scripts/boot/robot-boot.env | 11 + scripts/boot/start-5g-link-logger-service.sh | 18 ++ scripts/boot/start-b-side-omnid-service.sh | 1 + scripts/boot/start-ros-receiver-service.sh | 1 + scripts/boot/systemd/blitz-5g-dial.service.in | 5 +- .../systemd/blitz-5g-link-logger.service.in | 18 ++ .../systemd/blitz-b-side-omnid.service.in | 7 +- .../boot/systemd/blitz-boot-gate.service.in | 1 + scripts/boot/systemd/blitz-robot.target.in | 2 + .../systemd/blitz-ros-receiver.service.in | 7 +- .../boot/systemd/blitz-run-context.service.in | 15 ++ .../boot/systemd/blitz-watchdog.service.in | 6 +- scripts/dev/load-env.sh | 159 ++++++++++++ scripts/dev/reset-run-context.sh | 10 + scripts/dev/start-5g-link-logger.sh | 9 + scripts/dev/start-b-side-omnid.sh | 25 +- scripts/dev/start-backend.sh | 1 + src/kcp_packet_debug.c | 2 +- src/kcp_session_stats.c | 2 +- src/latencylog.c | 2 +- src/omni_common.c | 229 +++++++++++++++++- src/tx_timestamp_debug.c | 2 +- src/video_pipeline.c | 7 +- src/video_pipeline_gps.c | 7 +- 35 files changed, 1324 insertions(+), 21 deletions(-) create mode 100644 scripts/boot/blitz-5g-link-logger.sh create mode 100644 scripts/boot/blitz-incident-capture-launch.sh create mode 100644 scripts/boot/blitz-incident-capture.sh create mode 100644 scripts/boot/blitz-run-context.sh create mode 100644 scripts/boot/start-5g-link-logger-service.sh create mode 100644 scripts/boot/systemd/blitz-5g-link-logger.service.in create mode 100644 scripts/boot/systemd/blitz-run-context.service.in create mode 100644 scripts/dev/reset-run-context.sh create mode 100644 scripts/dev/start-5g-link-logger.sh diff --git a/cmd/b_side_omnid.c b/cmd/b_side_omnid.c index d7e721b..7e267bc 100644 --- a/cmd/b_side_omnid.c +++ b/cmd/b_side_omnid.c @@ -14,6 +14,7 @@ #include "cJSON.h" #include "control_protocol.h" +#include "latencylog.h" #include "protocol.h" #include "video_pipeline.h" @@ -26,6 +27,8 @@ #define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall" #define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall" #define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15 +#define DEFAULT_KCP_STATS_INTERVAL_MS 1000 +#define DEFAULT_CONTROL_LATENCY_SAMPLE_MOD 100 #define EXIT_CODE_VIDEO_THREAD_STALLED 101 #define EXIT_CODE_CONTROL_THREAD_STALLED 102 @@ -65,11 +68,15 @@ typedef struct daemon_state { int control_server_idle_reconnect_ms; const char *runtime_dir; int heartbeat_timeout_sec; + int stats_interval_ms; + uint64_t control_latency_sample_mod; char status_file_path[512]; char video_thread_fault_file[512]; char control_thread_fault_file[512]; atomic_long video_thread_heartbeat_epoch_sec; atomic_long control_thread_heartbeat_epoch_sec; + kcp_session_stats_logger_t *stats_logger; + latency_logger_t *control_latency_logger; unix_dgram_client_t unix_client; control_bridge_stats_t control_stats; } daemon_state_t; @@ -127,6 +134,21 @@ static int env_int_or_default(const char *name, int fallback) { return parsed; } +static uint64_t env_u64_or_default(const char *name, uint64_t fallback) { + const char *value = getenv(name); + unsigned long long parsed = 0ULL; + char *endptr = NULL; + + if (value == NULL || value[0] == '\0') { + return fallback; + } + parsed = strtoull(value, &endptr, 10); + if (endptr == value || *endptr != '\0' || parsed == 0ULL) { + return fallback; + } + return (uint64_t) parsed; +} + static int64_t realtime_epoch_ms(void) { struct timespec ts; @@ -145,6 +167,19 @@ static void update_thread_heartbeat(atomic_long *heartbeat) { atomic_store(heartbeat, realtime_epoch_sec()); } +static int should_log_control_latency(const daemon_state_t *state, const message_t *msg) { + uint64_t sample_mod; + + if (state == NULL || state->control_latency_logger == NULL || msg == NULL) { + return 0; + } + sample_mod = state->control_latency_sample_mod; + if (sample_mod <= 1U) { + return 1; + } + return msg->id % sample_mod == 0U; +} + static void video_pipeline_heartbeat_progress(void *context) { update_thread_heartbeat((atomic_long *) context); } @@ -556,8 +591,8 @@ static void *control_thread_main(void *arg) { &options, NULL, NULL, - NULL, - KCP_DEFAULT_STATS_INTERVAL_MS + state->stats_logger, + state->stats_interval_ms ); if (client == NULL) { control_bridge_set_errno_error(&state->control_stats, "failed to connect control session"); @@ -692,6 +727,11 @@ static void *control_thread_main(void *arg) { continue; } + if (should_log_control_latency(state, &msg)) { + latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_APP_RECV, &msg); + latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_BEGIN, &msg); + } + if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) { int send_errno = errno; int recovered = 0; @@ -708,6 +748,9 @@ static void *control_thread_main(void *arg) { state->control_stats.server_idle_ms = client_state.server_idle_ms; kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); + if (should_log_control_latency(state, &msg)) { + latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, &msg); + } protocol_message_clear(&msg); continue; } @@ -728,6 +771,9 @@ static void *control_thread_main(void *arg) { state->control_stats.server_idle_ms = client_state.server_idle_ms; kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); + if (should_log_control_latency(state, &msg)) { + latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, &msg); + } protocol_message_clear(&msg); } @@ -799,8 +845,12 @@ int main(void) { "BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC", DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC ); + state.stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", DEFAULT_KCP_STATS_INTERVAL_MS); + state.control_latency_sample_mod = env_u64_or_default("BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD", DEFAULT_CONTROL_LATENCY_SAMPLE_MOD); state.video_config.progress_callback = video_pipeline_heartbeat_progress; state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec; + state.video_config.stats_logger = NULL; + state.video_config.stats_interval_ms = state.stats_interval_ms; state.control_server_idle_reconnect_ms = env_int_or_default( "OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS", CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS @@ -860,11 +910,34 @@ int main(void) { return 1; } + { + const char *stats_log_path = getenv("BLITZ_KCP_STATS_LOG_PATH"); + const char *latency_log_path = getenv("BLITZ_CONTROL_LATENCY_LOG_PATH"); + int latency_enabled = env_int_or_default("BLITZ_CONTROL_LATENCY_LOG_ENABLED", 1); + + if (stats_log_path != NULL && stats_log_path[0] != '\0') { + state.stats_logger = kcp_session_stats_open_jsonl(stats_log_path); + if (state.stats_logger == NULL) { + fprintf(stderr, "[b_side_omnid] warning: failed to open KCP stats log %s\n", stats_log_path); + } + } + if (latency_enabled && latency_log_path != NULL && latency_log_path[0] != '\0') { + state.control_latency_logger = latencylog_open_jsonl(latency_log_path); + if (state.control_latency_logger == NULL) { + fprintf(stderr, "[b_side_omnid] warning: failed to open control latency log %s\n", latency_log_path); + } + } + state.video_config.stats_logger = state.stats_logger; + state.video_config.stats_interval_ms = state.stats_interval_ms; + } + if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) { perror("pthread_create(video_thread)"); unix_dgram_client_close(&state.unix_client); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); + latencylog_close(state.control_latency_logger); + kcp_session_stats_close(state.stats_logger); return 1; } if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) { @@ -874,6 +947,8 @@ int main(void) { unix_dgram_client_close(&state.unix_client); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); + latencylog_close(state.control_latency_logger); + kcp_session_stats_close(state.stats_logger); return 1; } @@ -891,5 +966,7 @@ int main(void) { unix_dgram_client_close(&state.unix_client); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); + latencylog_close(state.control_latency_logger); + kcp_session_stats_close(state.stats_logger); return 0; } diff --git a/include/omni_common.h b/include/omni_common.h index 7f8bfbb..09b8432 100644 --- a/include/omni_common.h +++ b/include/omni_common.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -31,6 +32,15 @@ typedef struct omni_file_logger { FILE *file; pthread_mutex_t mutex; + char path[PATH_MAX]; + size_t current_bytes; + size_t buffered_bytes; + size_t flush_bytes; + size_t max_bytes; + int flush_interval_ms; + int max_files; + int immediate_flush; + uint64_t last_flush_monotonic_ms; } omni_file_logger_t; int64_t omni_now_unix_nano(void); @@ -61,6 +71,7 @@ double omni_duration_ms_to_ns(double ms); const char *omni_path_base_name(const char *path); void omni_file_logger_init(omni_file_logger_t *logger, FILE *file); +void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush); void omni_file_logger_destroy(omni_file_logger_t *logger); int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line); diff --git a/include/video_pipeline.h b/include/video_pipeline.h index dcb1afa..914ff38 100644 --- a/include/video_pipeline.h +++ b/include/video_pipeline.h @@ -43,6 +43,8 @@ typedef struct video_pipeline_config { int hard_backpressure_hold_ms; int server_idle_reconnect_ms; int frame_stall_reconnect_ms; + kcp_session_stats_logger_t *stats_logger; + int stats_interval_ms; video_pipeline_progress_fn progress_callback; void *progress_context; } video_pipeline_config_t; diff --git a/scripts/boot/blitz-5g-link-logger.sh b/scripts/boot/blitz-5g-link-logger.sh new file mode 100644 index 0000000..bfcdfcd --- /dev/null +++ b/scripts/boot/blitz-5g-link-logger.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="5g-link-logger" + +resolve_target_ip() { + if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then + printf '%s\n' "${BLITZ_TIME_SERVER_IP}" + return 0 + fi + + for candidate in ${BLITZ_5G_ROUTE_TARGETS//,/ }; do + if [[ -n "${candidate}" ]]; then + printf '%s\n' "${candidate}" + return 0 + fi + done + return 1 +} + +emit_sample_json() { + local interface_name="${1:-}" + local target_ip="${2:-}" + + python3 - "${interface_name}" "${target_ip}" <<'PY' +import json +import subprocess +import sys +import time + +interface_name = sys.argv[1] +target_ip = sys.argv[2] + +payload = { + "ts_unix_ms": time.time_ns() // 1_000_000, + "interface": interface_name, + "target_ip": target_ip, + "link_present": False, + "route_output": "", + "route_ok": False, + "probe_ok": False, + "ping_rtt_ms": None, + "rx_bytes": 0, + "tx_bytes": 0, + "rx_packets": 0, + "tx_packets": 0, + "rx_errors": 0, + "tx_errors": 0, + "rx_drops": 0, + "tx_drops": 0, +} + +if interface_name: + try: + output = subprocess.check_output( + ["ip", "-j", "-s", "link", "show", "dev", interface_name], + text=True, + stderr=subprocess.DEVNULL, + ) + stats = json.loads(output) + if stats: + item = stats[0] + payload["link_present"] = True + rx = item.get("stats64", {}).get("rx", {}) + tx = item.get("stats64", {}).get("tx", {}) + if not rx and not tx: + rx = item.get("stats", {}).get("rx", {}) + tx = item.get("stats", {}).get("tx", {}) + payload["rx_bytes"] = int(rx.get("bytes") or 0) + payload["tx_bytes"] = int(tx.get("bytes") or 0) + payload["rx_packets"] = int(rx.get("packets") or 0) + payload["tx_packets"] = int(tx.get("packets") or 0) + payload["rx_errors"] = int(rx.get("errors") or 0) + payload["tx_errors"] = int(tx.get("errors") or 0) + payload["rx_drops"] = int(rx.get("dropped") or 0) + payload["tx_drops"] = int(tx.get("dropped") or 0) + except Exception: + pass + +if target_ip: + try: + route = subprocess.check_output( + ["ip", "route", "get", target_ip], + text=True, + stderr=subprocess.STDOUT, + ).strip() + payload["route_output"] = route.splitlines()[0] if route else "" + payload["route_ok"] = bool(payload["route_output"]) and ( + not interface_name or f" dev {interface_name}" in payload["route_output"] + ) + except Exception as exc: + payload["route_output"] = str(exc) + + ping_cmd = ["ping", "-c", "1", "-W", "2", target_ip] + if interface_name: + ping_cmd[1:1] = ["-I", interface_name] + ping = subprocess.run(ping_cmd, capture_output=True, text=True) + payload["probe_ok"] = ping.returncode == 0 + output = (ping.stdout or "") + "\n" + (ping.stderr or "") + for token in output.replace("\n", " ").split(): + if token.startswith("time="): + value = token.split("=", 1)[1].rstrip("ms") + try: + payload["ping_rtt_ms"] = float(value) + except ValueError: + pass + break + +print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False)) +PY +} + +if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then + blitz_load_boot_env + blitz_require_run_context +fi + +if [[ -z "${BLITZ_RUN_DIR:-}" && -f "${BLITZ_RUN_CONTEXT_FILE:-}" ]]; then + blitz_load_run_context_env || true +fi +blitz_ensure_instance_id + +export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}" +target_ip="$(resolve_target_ip || true)" + +blitz_log "${STEP}" "start" "start" "path=${BLITZ_5G_LINK_LOG_PATH} interval_sec=${BLITZ_5G_LINK_LOG_INTERVAL_SEC}" 0 + +while true; do + interface_name="$(blitz_resolve_5g_interface || true)" + line="$(emit_sample_json "${interface_name}" "${target_ip}")" + blitz_jsonl_append_line "${BLITZ_5G_LINK_LOG_PATH}" "${line}" + sleep "${BLITZ_5G_LINK_LOG_INTERVAL_SEC}" +done diff --git a/scripts/boot/blitz-incident-capture-launch.sh b/scripts/boot/blitz-incident-capture-launch.sh new file mode 100644 index 0000000..0bd788b --- /dev/null +++ b/scripts/boot/blitz-incident-capture-launch.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="incident-launch" +incident_id="" +args=() +timeout_bin="" + +while (($# > 0)); do + case "$1" in + --incident-id) + incident_id="${2:-}" + shift 2 + ;; + *) + args+=("$1") + shift + ;; + esac +done + +blitz_load_boot_env +blitz_require_root "${STEP}" +blitz_require_command systemd-run "${STEP}" +blitz_require_command timeout "${STEP}" +timeout_bin="$(command -v timeout)" + +if [[ -z "${incident_id}" ]]; then + incident_id="$(blitz_new_incident_id)" +fi + +unit_name="blitz-incident-${incident_id//[^A-Za-z0-9_.-]/-}" + +systemd-run \ + --quiet \ + --collect \ + --unit "${unit_name}" \ + --property=Type=oneshot \ + --property="StandardOutput=append:${BLITZ_LOG_FILE}" \ + --property="StandardError=append:${BLITZ_LOG_FILE}" \ + "${timeout_bin}" "${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC}s" \ + /bin/bash "${SCRIPT_DIR}/blitz-incident-capture.sh" \ + --incident-id "${incident_id}" \ + "${args[@]}" + +printf '%s\n' "${incident_id}" diff --git a/scripts/boot/blitz-incident-capture.sh b/scripts/boot/blitz-incident-capture.sh new file mode 100644 index 0000000..c6bcdfd --- /dev/null +++ b/scripts/boot/blitz-incident-capture.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="incident-capture" +incident_id="" +incident_source="" +incident_reason="" +incident_unit="" +incident_result="" +incident_exit_status="" + +run_capture() { + local output_path="$1" + shift + + if command -v timeout >/dev/null 2>&1; then + timeout "${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC}s" "$@" > "${output_path}" 2>&1 || true + else + "$@" > "${output_path}" 2>&1 || true + fi +} + +while (($# > 0)); do + case "$1" in + --incident-id) + incident_id="${2:-}" + shift 2 + ;; + --source) + incident_source="${2:-}" + shift 2 + ;; + --reason) + incident_reason="${2:-}" + shift 2 + ;; + --unit) + incident_unit="${2:-}" + shift 2 + ;; + --result) + incident_result="${2:-}" + shift 2 + ;; + --exit-status) + incident_exit_status="${2:-}" + shift 2 + ;; + *) + blitz_log "${STEP}" "parse-arg" "failure" "unknown argument: $1" 2 + exit 2 + ;; + esac +done + +if [[ -n "${incident_result}" && "${incident_result}" == "success" ]]; then + exit 0 +fi + +blitz_load_boot_env +blitz_load_run_context_env || true +blitz_prepare_runtime_dir +blitz_prepare_run_root + +if [[ -z "${incident_id}" ]]; then + incident_id="$(blitz_new_incident_id)" +fi + +incident_dir="${BLITZ_RUN_ROOT}/incidents/${incident_id}" +mkdir -p "${incident_dir}" + +python3 - "${incident_dir}/incident.json" "${incident_id}" "${BLITZ_RUN_ID:-}" "${incident_source}" "${incident_reason}" "${incident_unit}" "${incident_result}" "${incident_exit_status}" "${BLITZ_RUN_DIR:-}" "${HOSTNAME:-$(hostname)}" <<'PY' +import json +import sys +import time + +path, incident_id, run_id, source, reason, unit, result, exit_status, run_dir, hostname = sys.argv[1:10] +payload = { + "incident_id": incident_id, + "run_id": run_id, + "source": source, + "fault_reason": reason, + "unit": unit, + "service_result": result, + "exit_status": exit_status, + "run_dir": run_dir, + "hostname": hostname, + "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), +} +with open(path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True) +PY + +for status_file in \ + "${BLITZ_RUNTIME_DIR}/watchdog.status.json" \ + "${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" \ + "${BLITZ_RUNTIME_DIR}/ros-receiver.status.json" +do + if [[ -f "${status_file}" ]]; then + cp -f "${status_file}" "${incident_dir}/$(basename "${status_file}")" + fi +done + +if [[ -f "${BLITZ_LOG_FILE}" ]]; then + tail -n 400 "${BLITZ_LOG_FILE}" > "${incident_dir}/startup.log.tail" +fi + +run_capture "${incident_dir}/systemctl-status.txt" \ + systemctl status blitz-robot.target blitz-run-context.service blitz-5g-dial.service blitz-5g-link-logger.service blitz-ros-receiver.service blitz-b-side-omnid.service blitz-watchdog.service +run_capture "${incident_dir}/journal.txt" \ + journalctl --no-pager --since "5 minutes ago" -u blitz-run-context.service -u blitz-5g-dial.service -u blitz-5g-link-logger.service -u blitz-ros-receiver.service -u blitz-b-side-omnid.service -u blitz-watchdog.service +run_capture "${incident_dir}/ip-addr.txt" ip addr +run_capture "${incident_dir}/ip-route.txt" ip route +run_capture "${incident_dir}/ss-uapn.txt" ss -uapn +run_capture "${incident_dir}/ss-xlp.txt" ss -xlp + +if [[ -f "${BLITZ_5G_INFO_JSON:-}" ]]; then + cp -f "${BLITZ_5G_INFO_JSON}" "${incident_dir}/$(basename "${BLITZ_5G_INFO_JSON}")" +fi + +if [[ -n "${BLITZ_RUN_DIR:-}" && -d "${BLITZ_RUN_DIR}" ]]; then + while IFS= read -r -d '' jsonl; do + tail -n 200 "${jsonl}" > "${incident_dir}/tail-$(basename "${jsonl}")" + done < <(find "${BLITZ_RUN_DIR}" -maxdepth 1 -type f -name '*.jsonl' -print0 2>/dev/null) +fi + +blitz_log "${STEP}" "complete" "success" "incident_id=${incident_id} path=${incident_dir}" 0 diff --git a/scripts/boot/blitz-run-context.sh b/scripts/boot/blitz-run-context.sh new file mode 100644 index 0000000..50be475 --- /dev/null +++ b/scripts/boot/blitz-run-context.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="run-context" + +blitz_load_boot_env +blitz_require_root "${STEP}" +blitz_require_command python3 "${STEP}" +blitz_init_run_context +blitz_log "${STEP}" "complete" "success" "run_id=${BLITZ_RUN_ID} run_dir=${BLITZ_RUN_DIR}" 0 diff --git a/scripts/boot/blitz-watchdog.sh b/scripts/boot/blitz-watchdog.sh index 117f026..dbbefec 100644 --- a/scripts/boot/blitz-watchdog.sh +++ b/scripts/boot/blitz-watchdog.sh @@ -12,6 +12,10 @@ B_SIDE_STATUS_FILE="" ROS_STATUS_FILE="" WATCHDOG_STATUS_FILE="" NETWORK_FAULT_FILE="" +WATCHDOG_EVENT_LOG="" +WATCHDOG_SAMPLE_LOG="" +WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0 +WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0 CAMERA_MISSING_PREV=0 CAMERA_RECOVERY_STABLE_COUNT=0 NETWORK_FAIL_COUNT=0 @@ -28,6 +32,8 @@ GPS_LAST_CHECK_SEC=0 GPS_DEVICE_PRESENT_PREV=-1 GPS_DEVICE_PRESENT_STATE=1 GPS_STACK_ACTIVE_STATE=1 +LAST_REPORTED_FAULT_REASON="" +LAST_REPORTED_RECOVERY_STATE="" declare -A TARGETED_RESTART_WINDOW_START=() declare -A TARGETED_RESTART_WINDOW_COUNT=() @@ -236,6 +242,116 @@ EOF mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}" } +watchdog_emit_json() { + local record_type="$1" + local action="$2" + local fault_reason="$3" + local recovery_state="$4" + local detail="$5" + local incident_id="${6:-}" + local network_ok="${7:-1}" + local camera_ok="${8:-1}" + local ros_ok="${9:-1}" + local bside_ok="${10:-1}" + local gps_ok="${11:-1}" + local gps_device_present="${12:-1}" + + python3 - "${record_type}" "${action}" "${fault_reason}" "${recovery_state}" "${detail}" "${incident_id}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}" "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}" <<'PY' +import json +import sys +import time + +record_type, action, fault_reason, recovery_state, detail, incident_id, network_ok, camera_ok, ros_ok, bside_ok, gps_ok, gps_device_present, last_action, last_action_epoch_ms, network_fail_count, targeted_restart_count, full_restart_count = sys.argv[1:18] +payload = { + "ts_unix_ms": time.time_ns() // 1_000_000, + "record_type": record_type, + "action": action, + "fault_reason": fault_reason, + "recovery_state": recovery_state, + "detail": detail, + "incident_id": incident_id or None, + "network_ok": network_ok == "1", + "camera_ok": camera_ok == "1", + "ros_ok": ros_ok == "1", + "bside_ok": bside_ok == "1", + "gps_ok": gps_ok == "1", + "gps_device_present": gps_device_present == "1", + "network_fail_count": int(network_fail_count), + "targeted_restart_count": int(targeted_restart_count), + "full_restart_count": int(full_restart_count), + "last_action": last_action, + "last_action_epoch_ms": int(last_action_epoch_ms or 0), +} +print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False)) +PY +} + +watchdog_append_event() { + local line="" + + [[ -n "${WATCHDOG_EVENT_LOG}" ]] || return 0 + if ! line="$(watchdog_emit_json "$@" 2>&1)"; then + if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then + blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${line}" 0 || true + WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1 + fi + return 0 + fi + if ! blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${line}"; then + if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then + blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=append-failed" 0 || true + WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1 + fi + return 0 + fi + WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0 +} + +watchdog_append_sample() { + local line="" + + [[ -n "${WATCHDOG_SAMPLE_LOG}" ]] || return 0 + if ! line="$(watchdog_emit_json "$@" 2>&1)"; then + if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then + blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${line}" 0 || true + WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1 + fi + return 0 + fi + if ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${line}"; then + if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then + blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=append-failed" 0 || true + WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1 + fi + return 0 + fi + WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0 +} + +watchdog_record_state_transition() { + local fault_reason="$1" + local recovery_state="$2" + + if [[ "${fault_reason}" == "${LAST_REPORTED_FAULT_REASON}" && "${recovery_state}" == "${LAST_REPORTED_RECOVERY_STATE}" ]]; then + return 0 + fi + watchdog_append_event "event" "state-transition" "${fault_reason}" "${recovery_state}" "" "" + LAST_REPORTED_FAULT_REASON="${fault_reason}" + LAST_REPORTED_RECOVERY_STATE="${recovery_state}" +} + +watchdog_launch_incident() { + local reason="$1" + local unit_name="$2" + + "${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" \ + --source watchdog \ + --reason "${reason}" \ + --unit "${unit_name}" \ + --result failure \ + --exit-status 1 2>/dev/null || true +} + set_last_action() { LAST_ACTION="$1" LAST_ACTION_EPOCH_MS="$(now_epoch_ms)" @@ -283,6 +399,7 @@ record_full_restart() { fi if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then BACKOFF_UNTIL=$(( now_sec + 60 )) + watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" "" fi } @@ -290,36 +407,45 @@ restart_bside_targeted() { local fault_key="$1" local reason="$2" local rc + local incident_id="" if register_targeted_restart "${fault_key}"; then blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0 + watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" "" full_restart_stack "${reason}-escalated" return 0 fi + incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")" set_last_action "restart-bside" RECOVERY_ACTION_TAKEN=1 blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0 + watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}" if systemctl restart "${B_SIDE_SERVICE}"; then blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0 + watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}" return 0 fi rc=$? blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}" + watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}" return "${rc}" } full_restart_stack() { local reason="$1" local rc + local incident_id="" + incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")" set_last_action "full-restart" RECOVERY_ACTION_TAKEN=1 recovery_state="recovering" fault_reason="${reason}" blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0 + watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}" systemctl stop "${B_SIDE_SERVICE}" || true if systemctl restart "${ROS_SERVICE}"; then @@ -345,9 +471,11 @@ full_restart_stack() { else rc=$? blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}" + watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}" record_full_restart return "${rc}" fi + watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}" record_full_restart } @@ -659,6 +787,7 @@ wait_for_network_recovery() { perform_network_recovery() { local rc=0 + local incident_id="" if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then set_last_action "route-repair" @@ -666,12 +795,15 @@ perform_network_recovery() { NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC )) NETWORK_FAIL_COUNT=0 blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0 + watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" "" return 0 fi + incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")" set_last_action "network-recovery" RECOVERY_ACTION_TAKEN=1 blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0 + watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}" systemctl stop "${B_SIDE_SERVICE}" || true if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then @@ -679,6 +811,7 @@ perform_network_recovery() { else rc=$? blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}" + watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}" return "${rc}" fi @@ -687,11 +820,13 @@ perform_network_recovery() { else rc=$? blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}" + watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}" return "${rc}" fi NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC )) NETWORK_FAIL_COUNT=0 + watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}" if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then restart_bside_targeted "network" "network-recovered" return 0 @@ -707,11 +842,14 @@ blitz_require_command stat "${STEP}" blitz_require_command ping "${STEP}" blitz_require_command python3 "${STEP}" blitz_prepare_runtime_dir +blitz_require_run_context B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json" WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json" NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down" +WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl" +WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl" while true; do fault_reason="none" @@ -735,7 +873,9 @@ while true; do if (( BACKOFF_UNTIL > now_sec )); then fault_reason="backoff" recovery_state="backoff" + watchdog_record_state_transition "${fault_reason}" "${recovery_state}" write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}" + watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}" sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}" continue fi @@ -824,6 +964,8 @@ while true; do full_restart_stack "ros-unhealthy" || true fi + watchdog_record_state_transition "${fault_reason}" "${recovery_state}" write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" + watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}" done diff --git a/scripts/boot/common.sh b/scripts/boot/common.sh index faa57fd..bbbf104 100644 --- a/scripts/boot/common.sh +++ b/scripts/boot/common.sh @@ -33,8 +33,10 @@ blitz_load_boot_env() { return 0 fi + export BLITZ_BOOT_LOADING_ENV="1" # shellcheck disable=SC1091 source "${DEV_SCRIPT_DIR}/load-env.sh" + unset BLITZ_BOOT_LOADING_ENV for env_file in \ "${BOOT_SCRIPT_DIR}/robot-boot.env" \ @@ -51,8 +53,12 @@ blitz_load_boot_env() { default_time_server="$(blitz_host_from_addr "${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}" || true)" export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}" + export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-/var/log/blitz-robot}" export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}" export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}" + export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}" + export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}" + export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}" export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}" export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}" export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}" @@ -71,6 +77,16 @@ blitz_load_boot_env() { export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}" export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}" export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}" + export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}" + export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}" + export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}" + export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}" + export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}" + export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}" + export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}" + export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}" + export BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC:-5}" + export BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC:-30}" export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}" export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}" export BLITZ_GPS_MONITOR_ENABLED="${BLITZ_GPS_MONITOR_ENABLED:-1}" @@ -409,3 +425,212 @@ blitz_prepare_runtime_dir() { fi blitz_log "runtime-dir" "prepare" "success" "path=${runtime_dir}" 0 } + +blitz_prepare_run_root() { + local run_root + local run_dir + local incidents_dir + + blitz_load_boot_env + run_root="${BLITZ_RUN_ROOT}" + run_dir="${run_root}/runs" + incidents_dir="${run_root}/incidents" + + mkdir -p "${run_dir}" "${incidents_dir}" + if [[ "${EUID}" -eq 0 ]]; then + chown -R "root:${BLITZ_ROS_USER}" "${run_root}" 2>/dev/null || true + chmod 0775 "${run_root}" "${run_dir}" "${incidents_dir}" 2>/dev/null || true + fi +} + +blitz_load_run_context_env() { + local context_file="${1:-${BLITZ_RUN_CONTEXT_FILE:-}}" + + if [[ -z "${context_file}" || ! -f "${context_file}" ]]; then + return 1 + fi + + set -a + # shellcheck disable=SC1090 + source "${context_file}" + set +a + return 0 +} + +blitz_read_run_id() { + local run_id_file="${BLITZ_RUN_ID_FILE:-}" + + if [[ -z "${run_id_file}" || ! -f "${run_id_file}" ]]; then + return 1 + fi + tr -d '\r\n' < "${run_id_file}" +} + +blitz_utc_compact_timestamp() { + date -u '+%Y%m%dT%H%M%SZ' +} + +blitz_new_run_id() { + printf '%s\n' "$(blitz_utc_compact_timestamp)" +} + +blitz_new_incident_id() { + local prefix="${1:-incident}" + printf '%s-%s-%d\n' "${prefix}" "$(blitz_utc_compact_timestamp)" "$$" +} + +blitz_new_instance_id() { + printf '%s-%d\n' "$(blitz_utc_compact_timestamp)" "$$" +} + +blitz_git_commit() { + git -C "${OMNISOCKETGO_ROOT}" rev-parse HEAD 2>/dev/null || true +} + +blitz_git_dirty_flag() { + if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty >/dev/null 2>&1; then + printf '0\n' + return 0 + fi + printf '1\n' +} + +blitz_write_run_context() { + local run_id="$1" + local run_dir="$2" + local boot_id="$3" + local context_file="${BLITZ_RUN_CONTEXT_FILE}" + local id_file="${BLITZ_RUN_ID_FILE}" + local temp_context + local temp_info + local commit_hash + local dirty_flag + local started_at + + commit_hash="$(blitz_git_commit)" + dirty_flag="$(blitz_git_dirty_flag)" + started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + temp_context="${context_file}.tmp.$$" + temp_info="${run_dir}/run-info.json.tmp.$$" + + mkdir -p "${run_dir}" + printf '%s\n' "${run_id}" > "${id_file}" + + cat > "${temp_context}" </dev/null || echo 0)" + if (( size < max_bytes )); then + return 0 + fi + + for (( index=max_files; index>=1; index-- )); do + if [[ "${index}" -eq "${max_files}" ]]; then + rm -f "${path}.${index}" + fi + if [[ -f "${path}.${index}" ]]; then + mv -f "${path}.${index}" "${path}.$(( index + 1 ))" + fi + done + mv -f "${path}" "${path}.1" +} + +blitz_jsonl_append_line() { + local path="$1" + local line="$2" + + mkdir -p "$(dirname "${path}")" + blitz_jsonl_rotate_if_needed "${path}" + printf '%s\n' "${line}" >> "${path}" +} + +blitz_launch_incident_capture() { + local launch_script="${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" + + if [[ ! -f "${launch_script}" ]]; then + return 1 + fi + "${launch_script}" "$@" >/dev/null 2>&1 || return 1 +} diff --git a/scripts/boot/disable-systemd.sh b/scripts/boot/disable-systemd.sh index b7da8f5..e2f6601 100644 --- a/scripts/boot/disable-systemd.sh +++ b/scripts/boot/disable-systemd.sh @@ -9,9 +9,11 @@ STEP="disable" SYSTEMD_DEST_DIR="/etc/systemd/system" UNITS=( "blitz-watchdog.service" + "blitz-5g-link-logger.service" "blitz-b-side-omnid.service" "blitz-ros-receiver.service" "blitz-5g-dial.service" + "blitz-run-context.service" "blitz-boot-gate.service" "blitz-robot.target" ) diff --git a/scripts/boot/install-systemd.sh b/scripts/boot/install-systemd.sh index 9c91e3d..00145a7 100644 --- a/scripts/boot/install-systemd.sh +++ b/scripts/boot/install-systemd.sh @@ -54,9 +54,12 @@ touch "${BLITZ_LOG_FILE}" chmod 0644 "${BLITZ_LOG_FILE}" blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0 blitz_prepare_runtime_dir +blitz_prepare_run_root install_unit "blitz-boot-gate.service.in" +install_unit "blitz-run-context.service.in" install_unit "blitz-5g-dial.service.in" +install_unit "blitz-5g-link-logger.service.in" install_unit "blitz-ros-receiver.service.in" install_unit "blitz-b-side-omnid.service.in" install_unit "blitz-watchdog.service.in" diff --git a/scripts/boot/robot-boot.env b/scripts/boot/robot-boot.env index cce0160..727767c 100644 --- a/scripts/boot/robot-boot.env +++ b/scripts/boot/robot-boot.env @@ -2,6 +2,7 @@ # Override machine-specific values in robot-boot.env.local. BLITZ_BOOT_DELAY_SEC="30" +BLITZ_RUN_ROOT="/var/log/blitz-robot" BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log" BLITZ_RUNTIME_DIR="/run/blitz-robot" @@ -25,6 +26,16 @@ BLITZ_ROS_SOCKET_WAIT_SEC="20" BLITZ_WATCHDOG_INTERVAL_SEC="5" BLITZ_HEALTH_STALE_SEC="15" BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15" +BLITZ_KCP_STATS_INTERVAL_MS="1000" +BLITZ_CONTROL_LATENCY_LOG_ENABLED="1" +BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="100" +BLITZ_5G_LINK_LOG_INTERVAL_SEC="5" +BLITZ_JSONL_FLUSH_INTERVAL_MS="1000" +BLITZ_JSONL_FLUSH_BYTES="262144" +BLITZ_JSONL_ROTATE_BYTES="134217728" +BLITZ_JSONL_ROTATE_FILES="8" +BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="5" +BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="30" BLITZ_NETWORK_FAIL_THRESHOLD="3" BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30" BLITZ_GPS_MONITOR_ENABLED="1" diff --git a/scripts/boot/start-5g-link-logger-service.sh b/scripts/boot/start-5g-link-logger-service.sh new file mode 100644 index 0000000..ea2c051 --- /dev/null +++ b/scripts/boot/start-5g-link-logger-service.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="5g-link-logger-service" + +blitz_load_boot_env +blitz_require_run_context + +export OMNI_BOOT_MODE="1" +export BLITZ_INSTANCE_ID="${BLITZ_INSTANCE_ID:-$(blitz_new_instance_id)}" +export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}" + +blitz_log "${STEP}" "start" "start" "exec bash ${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh" 0 +exec bash "${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh" diff --git a/scripts/boot/start-b-side-omnid-service.sh b/scripts/boot/start-b-side-omnid-service.sh index d61d553..53eea06 100644 --- a/scripts/boot/start-b-side-omnid-service.sh +++ b/scripts/boot/start-b-side-omnid-service.sh @@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh" STEP="b-side-omnid" blitz_load_boot_env +blitz_require_run_context blitz_require_executable "${OMNISOCKETGO_ROOT}/bin/b_side_omnid" "${STEP}" diff --git a/scripts/boot/start-ros-receiver-service.sh b/scripts/boot/start-ros-receiver-service.sh index 2541e66..8bea80a 100644 --- a/scripts/boot/start-ros-receiver-service.sh +++ b/scripts/boot/start-ros-receiver-service.sh @@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh" STEP="ros-receiver" blitz_load_boot_env +blitz_require_run_context blitz_require_file "/opt/ros/${ROS_DISTRO}/setup.bash" "${STEP}" blitz_require_file "${ROS_CONTROL_PY_DIR}/install/setup.bash" "${STEP}" diff --git a/scripts/boot/systemd/blitz-5g-dial.service.in b/scripts/boot/systemd/blitz-5g-dial.service.in index a9ce2f0..02a5c64 100644 --- a/scripts/boot/systemd/blitz-5g-dial.service.in +++ b/scripts/boot/systemd/blitz-5g-dial.service.in @@ -1,7 +1,8 @@ [Unit] Description=Blitz robot 5G dial -After=blitz-boot-gate.service -Requires=blitz-boot-gate.service +PartOf=blitz-robot.target +After=blitz-run-context.service +Requires=blitz-run-context.service [Service] Type=oneshot diff --git a/scripts/boot/systemd/blitz-5g-link-logger.service.in b/scripts/boot/systemd/blitz-5g-link-logger.service.in new file mode 100644 index 0000000..436d03e --- /dev/null +++ b/scripts/boot/systemd/blitz-5g-link-logger.service.in @@ -0,0 +1,18 @@ +[Unit] +Description=Blitz robot 5G link logger +PartOf=blitz-robot.target +After=blitz-run-context.service blitz-5g-dial.service +Wants=blitz-run-context.service blitz-5g-dial.service + +[Service] +Type=simple +EnvironmentFile=-/run/blitz-robot/run-context.env +ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh +ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-5g-link-logger-service.sh +Restart=always +RestartSec=5 +StandardOutput=append:@BLITZ_LOG_FILE@ +StandardError=append:@BLITZ_LOG_FILE@ + +[Install] +WantedBy=blitz-robot.target diff --git a/scripts/boot/systemd/blitz-b-side-omnid.service.in b/scripts/boot/systemd/blitz-b-side-omnid.service.in index 4fdcc43..db5a938 100644 --- a/scripts/boot/systemd/blitz-b-side-omnid.service.in +++ b/scripts/boot/systemd/blitz-b-side-omnid.service.in @@ -1,12 +1,15 @@ [Unit] Description=Blitz robot b-side omnid -After=blitz-5g-dial.service blitz-ros-receiver.service -Wants=blitz-5g-dial.service blitz-ros-receiver.service +PartOf=blitz-robot.target +After=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service +Wants=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service [Service] Type=simple +EnvironmentFile=-/run/blitz-robot/run-context.env ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh +ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason b-side-service-exit; fi' Restart=always RestartSec=2 StandardOutput=append:@BLITZ_LOG_FILE@ diff --git a/scripts/boot/systemd/blitz-boot-gate.service.in b/scripts/boot/systemd/blitz-boot-gate.service.in index 6c1b1ab..5f918ef 100644 --- a/scripts/boot/systemd/blitz-boot-gate.service.in +++ b/scripts/boot/systemd/blitz-boot-gate.service.in @@ -1,5 +1,6 @@ [Unit] Description=Blitz robot boot gate +PartOf=blitz-robot.target After=multi-user.target network-online.target Wants=network-online.target diff --git a/scripts/boot/systemd/blitz-robot.target.in b/scripts/boot/systemd/blitz-robot.target.in index f2a3136..7590c67 100644 --- a/scripts/boot/systemd/blitz-robot.target.in +++ b/scripts/boot/systemd/blitz-robot.target.in @@ -1,7 +1,9 @@ [Unit] Description=Blitz robot boot chain Wants=blitz-boot-gate.service +Wants=blitz-run-context.service Wants=blitz-5g-dial.service +Wants=blitz-5g-link-logger.service Wants=blitz-ros-receiver.service Wants=blitz-b-side-omnid.service Wants=blitz-watchdog.service diff --git a/scripts/boot/systemd/blitz-ros-receiver.service.in b/scripts/boot/systemd/blitz-ros-receiver.service.in index 437c136..71492f0 100644 --- a/scripts/boot/systemd/blitz-ros-receiver.service.in +++ b/scripts/boot/systemd/blitz-ros-receiver.service.in @@ -1,15 +1,18 @@ [Unit] Description=Blitz robot ROS receiver -After=blitz-5g-dial.service -Wants=blitz-5g-dial.service +PartOf=blitz-robot.target +After=blitz-run-context.service blitz-5g-dial.service +Wants=blitz-run-context.service blitz-5g-dial.service [Service] Type=simple User=@BLITZ_ROS_USER@ PermissionsStartOnly=true +EnvironmentFile=-/run/blitz-robot/run-context.env ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver +ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason ros-service-exit; fi' Restart=always RestartSec=2 StandardOutput=append:@BLITZ_LOG_FILE@ diff --git a/scripts/boot/systemd/blitz-run-context.service.in b/scripts/boot/systemd/blitz-run-context.service.in new file mode 100644 index 0000000..2ace077 --- /dev/null +++ b/scripts/boot/systemd/blitz-run-context.service.in @@ -0,0 +1,15 @@ +[Unit] +Description=Blitz robot run context +PartOf=blitz-robot.target +After=blitz-boot-gate.service +Requires=blitz-boot-gate.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-run-context.sh +StandardOutput=append:@BLITZ_LOG_FILE@ +StandardError=append:@BLITZ_LOG_FILE@ + +[Install] +WantedBy=blitz-robot.target diff --git a/scripts/boot/systemd/blitz-watchdog.service.in b/scripts/boot/systemd/blitz-watchdog.service.in index ad246a9..23a9ffe 100644 --- a/scripts/boot/systemd/blitz-watchdog.service.in +++ b/scripts/boot/systemd/blitz-watchdog.service.in @@ -1,10 +1,12 @@ [Unit] Description=Blitz robot health watchdog -After=blitz-b-side-omnid.service blitz-ros-receiver.service -Wants=blitz-b-side-omnid.service blitz-ros-receiver.service +PartOf=blitz-robot.target +After=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service +Wants=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service [Service] Type=simple +EnvironmentFile=-/run/blitz-robot/run-context.env ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh Restart=always diff --git a/scripts/dev/load-env.sh b/scripts/dev/load-env.sh index 306fdc8..f3357f9 100644 --- a/scripts/dev/load-env.sh +++ b/scripts/dev/load-env.sh @@ -134,3 +134,162 @@ export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR:-${ROBOT_SIDE_OMNISO export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}" export OMNI_CONTROL_UNIX_SOCKET_PATH="${OMNI_CONTROL_UNIX_SOCKET_PATH:-${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}}" export B_SIDE_OMNID_USE_SUDO="${B_SIDE_OMNID_USE_SUDO:-1}" +export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-${OMNISOCKETGO_ROOT}/logs/runtime}" +export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-${OMNISOCKETGO_ROOT}/logs}" +export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}" +export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}" +export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}" +export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}" +export BLITZ_5G_MODEM_SUBNET="${BLITZ_5G_MODEM_SUBNET:-192.168.224.0/22}" +export BLITZ_5G_GATEWAY="${BLITZ_5G_GATEWAY:-192.168.225.1}" +export BLITZ_5G_ROUTE_TARGETS="${BLITZ_5G_ROUTE_TARGETS:-106.55.173.235}" +export BLITZ_5G_INFO_JSON="${BLITZ_5G_INFO_JSON:-${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json}" +export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-}" +export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}" +export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}" +export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}" +export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}" +export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}" +export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}" +export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}" +export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}" + +blitz_dev_utc_compact_timestamp() { + date -u '+%Y%m%dT%H%M%SZ' +} + +blitz_dev_git_commit() { + git -C "${OMNISOCKETGO_ROOT}" rev-parse HEAD 2>/dev/null || true +} + +blitz_dev_git_dirty_flag() { + if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty >/dev/null 2>&1; then + printf '0\n' + return 0 + fi + printf '1\n' +} + +blitz_dev_prepare_dirs() { + mkdir -p "${BLITZ_RUNTIME_DIR}" "${BLITZ_RUN_ROOT}/runs" "${BLITZ_RUN_ROOT}/incidents" +} + +blitz_dev_write_run_info() { + local run_dir="$1" + local run_id="$2" + local boot_id="$3" + local tmp_info="${run_dir}/run-info.json.tmp.$$" + local started_at + local commit_hash + local dirty_flag + + started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + commit_hash="$(blitz_dev_git_commit)" + dirty_flag="$(blitz_dev_git_dirty_flag)" + + python3 - "${tmp_info}" "${run_id}" "${run_dir}" "${boot_id}" "${started_at}" "${commit_hash}" "${dirty_flag}" "${HOSTNAME:-$(hostname)}" <<'PY' +import json +import os +import sys + +path, run_id, run_dir, boot_id, started_at, commit_hash, dirty_flag, hostname = sys.argv[1:9] +payload = { + "run_id": run_id, + "run_dir": run_dir, + "boot_id": boot_id, + "started_at": started_at, + "hostname": hostname, + "git_commit": commit_hash, + "git_dirty": dirty_flag == "1", + "env": { + key: os.environ.get(key, "") + for key in sorted(os.environ) + if key.startswith(("BLITZ_", "OMNI_", "ROBOT_RECEIVER_")) + }, +} +with open(path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True) +PY + mv -f "${tmp_info}" "${run_dir}/run-info.json" +} + +blitz_dev_init_run_context() { + local run_id="${1:-$(blitz_dev_utc_compact_timestamp)}" + local boot_id="dev-$(blitz_dev_utc_compact_timestamp)" + local run_dir="${BLITZ_RUN_ROOT}/runs/${run_id}" + local tmp_context="${BLITZ_RUN_CONTEXT_FILE}.tmp.$$" + + blitz_dev_prepare_dirs + mkdir -p "${run_dir}" + export BLITZ_RUN_ID="${run_id}" + export BLITZ_RUN_DIR="${run_dir}" + export BLITZ_BOOT_ID="${boot_id}" + printf '%s\n' "${run_id}" > "${BLITZ_RUN_ID_FILE}" + cat > "${tmp_context}" </dev/null || true + wait "${logger_pid}" 2>/dev/null || true + fi +} + +start_5g_link_logger_if_needed() { + if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then + return 0 + fi + bash "${SCRIPT_DIR}/start-5g-link-logger.sh" & + logger_pid=$! + echo "[start-b-side-omnid] 5G link logger -> ${BLITZ_5G_LINK_LOG_PATH:-unset}" >&2 +} + if [[ ! -x "./bin/b_side_omnid" ]]; then if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then echo "Missing ./bin/b_side_omnid in boot mode; build it before enabling the autostart service." >&2 @@ -23,12 +42,14 @@ if [[ ! -x "./bin/b_side_omnid" ]]; then fi launch_b_side_omnid() { + trap cleanup EXIT INT TERM + start_5g_link_logger_if_needed bash "${SCRIPT_DIR}/apply-camera-controls.sh" - exec ./bin/b_side_omnid + ./bin/b_side_omnid } if [[ "${B_SIDE_OMNID_USE_SUDO}" == "1" && "${EUID}" -ne 0 ]]; then - exec sudo -E bash -lc 'cd "$1" && bash "$2" && exec "$3"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/apply-camera-controls.sh" "./bin/b_side_omnid" + exec sudo -E bash -lc 'cd "$1" && export B_SIDE_OMNID_USE_SUDO=0 && exec bash "$2"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/start-b-side-omnid.sh" fi launch_b_side_omnid diff --git a/scripts/dev/start-backend.sh b/scripts/dev/start-backend.sh index ec26e56..7698c95 100755 --- a/scripts/dev/start-backend.sh +++ b/scripts/dev/start-backend.sh @@ -5,6 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck disable=SC1091 source "${SCRIPT_DIR}/load-env.sh" require_robot_command_center_root +blitz_dev_prepare_backend_logging_env if [[ ! -d "${PYTHON_VENV_PATH}" ]]; then "${PYTHON3_BIN}" -m venv "${PYTHON_VENV_PATH}" diff --git a/src/kcp_packet_debug.c b/src/kcp_packet_debug.c index 33d958e..87540f8 100644 --- a/src/kcp_packet_debug.c +++ b/src/kcp_packet_debug.c @@ -18,7 +18,7 @@ kcp_packet_debug_logger_t *kcp_packet_debug_open_jsonl(const char *path) { fclose(file); return NULL; } - omni_file_logger_init(&logger->file_logger, file); + omni_file_logger_init_path(&logger->file_logger, file, path, 0); logger->enabled = 1; return logger; } diff --git a/src/kcp_session_stats.c b/src/kcp_session_stats.c index 10c328f..ad474cc 100644 --- a/src/kcp_session_stats.c +++ b/src/kcp_session_stats.c @@ -73,7 +73,7 @@ kcp_session_stats_logger_t *kcp_session_stats_open_jsonl(const char *path) { fclose(file); return NULL; } - omni_file_logger_init(&logger->file_logger, file); + omni_file_logger_init_path(&logger->file_logger, file, path, 0); logger->enabled = 1; return logger; } diff --git a/src/latencylog.c b/src/latencylog.c index 1c6ae7c..3a3314e 100644 --- a/src/latencylog.c +++ b/src/latencylog.c @@ -32,7 +32,7 @@ latency_logger_t *latencylog_open_jsonl(const char *path) { fclose(file); return NULL; } - omni_file_logger_init(&logger->file_logger, file); + omni_file_logger_init_path(&logger->file_logger, file, path, 0); logger->enabled = 1; return logger; } diff --git a/src/omni_common.c b/src/omni_common.c index 0c7c29c..fe2b1a8 100644 --- a/src/omni_common.c +++ b/src/omni_common.c @@ -544,9 +544,217 @@ const char *omni_path_base_name(const char *path) { return slash == NULL ? path : slash + 1; } +static uint64_t omni_now_monotonic_ms64(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000ULL + (uint64_t) (ts.tv_nsec / 1000000L); +} + +static int omni_positive_int_env(const char *name, int default_value) { + const char *raw = getenv(name); + long parsed; + char *endptr = NULL; + + if (raw == NULL || raw[0] == '\0') { + return default_value; + } + parsed = strtol(raw, &endptr, 10); + if (endptr == raw || *endptr != '\0' || parsed <= 0) { + return default_value; + } + return (int) parsed; +} + +static size_t omni_positive_size_env(const char *name, size_t default_value) { + const char *raw = getenv(name); + unsigned long long parsed; + char *endptr = NULL; + + if (raw == NULL || raw[0] == '\0') { + return default_value; + } + parsed = strtoull(raw, &endptr, 10); + if (endptr == raw || *endptr != '\0' || parsed == 0ULL) { + return default_value; + } + return (size_t) parsed; +} + +static int omni_file_logger_flush_locked(omni_file_logger_t *logger, uint64_t now_ms) { + if (logger == NULL || logger->file == NULL) { + errno = EINVAL; + return -1; + } + if (fflush(logger->file) != 0) { + return -1; + } + logger->buffered_bytes = 0U; + logger->last_flush_monotonic_ms = now_ms; + return 0; +} + +static int omni_build_rotated_path(char *buffer, size_t buffer_len, const char *path, int suffix) { + size_t path_len; + int written; + + if (buffer == NULL || buffer_len == 0U || path == NULL || path[0] == '\0') { + errno = EINVAL; + return -1; + } + path_len = strlen(path); + if (path_len + 16U >= buffer_len) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(buffer, path, path_len); + written = snprintf(buffer + path_len, buffer_len - path_len, ".%d", suffix); + if (written < 0 || (size_t) written >= buffer_len - path_len) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +static int omni_file_logger_reopen_append_locked(omni_file_logger_t *logger) { + struct stat st; + FILE *file; + + if (logger == NULL || logger->path[0] == '\0') { + errno = EINVAL; + return -1; + } + + file = fopen(logger->path, "ab"); + if (file == NULL) { + return -1; + } + + logger->file = file; + logger->current_bytes = 0U; + if (stat(logger->path, &st) == 0) { + logger->current_bytes = (size_t) st.st_size; + } + logger->buffered_bytes = 0U; + logger->last_flush_monotonic_ms = omni_now_monotonic_ms64(); + return 0; +} + +static int omni_file_logger_recover_after_rotate_locked(omni_file_logger_t *logger, const char *rotated_current_path) { + int reopen_errno; + + if (omni_file_logger_reopen_append_locked(logger) == 0) { + return 0; + } + + reopen_errno = errno; + if (rotated_current_path != NULL && rotated_current_path[0] != '\0') { + if (rename(rotated_current_path, logger->path) == 0) { + if (omni_file_logger_reopen_append_locked(logger) == 0) { + return 0; + } + } + } + + errno = reopen_errno; + return -1; +} + +static int omni_file_logger_rotate_locked(omni_file_logger_t *logger) { + int index; + int saved_errno = 0; + int should_recover = 0; + char rotated_current_path[PATH_MAX]; + char from_path[PATH_MAX]; + char to_path[PATH_MAX]; + + if (logger == NULL || logger->path[0] == '\0' || logger->max_bytes == 0U || logger->max_files <= 0) { + return 0; + } + rotated_current_path[0] = '\0'; + if (logger->file != NULL) { + if (omni_file_logger_flush_locked(logger, omni_now_monotonic_ms64()) != 0) { + return -1; + } + should_recover = 1; + if (fclose(logger->file) != 0) { + logger->file = NULL; + saved_errno = errno; + goto recover; + } + logger->file = NULL; + } + + if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, logger->max_files) != 0) { + saved_errno = errno; + goto recover; + } + unlink(from_path); + for (index = logger->max_files - 1; index >= 1; --index) { + if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, index) != 0 || + omni_build_rotated_path(to_path, sizeof(to_path), logger->path, index + 1) != 0) { + saved_errno = errno; + goto recover; + } + if (rename(from_path, to_path) != 0 && errno != ENOENT) { + saved_errno = errno; + goto recover; + } + } + if (omni_build_rotated_path(to_path, sizeof(to_path), logger->path, 1) != 0) { + saved_errno = errno; + goto recover; + } + if (rename(logger->path, to_path) != 0 && errno != ENOENT) { + saved_errno = errno; + goto recover; + } + snprintf(rotated_current_path, sizeof(rotated_current_path), "%s", to_path); + + if (omni_file_logger_reopen_append_locked(logger) != 0) { + saved_errno = errno; + goto recover; + } + return 0; + +recover: + if (should_recover) { + int recover_errno = saved_errno != 0 ? saved_errno : errno; + if (omni_file_logger_recover_after_rotate_locked(logger, rotated_current_path) == 0) { + errno = recover_errno; + } else if (saved_errno != 0) { + errno = saved_errno; + } + } else if (saved_errno != 0) { + errno = saved_errno; + } + return -1; +} + void omni_file_logger_init(omni_file_logger_t *logger, FILE *file) { + memset(logger, 0, sizeof(*logger)); logger->file = file; pthread_mutex_init(&logger->mutex, NULL); + logger->flush_bytes = 1U; + logger->flush_interval_ms = 0; + logger->immediate_flush = 1; + logger->last_flush_monotonic_ms = omni_now_monotonic_ms64(); +} + +void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush) { + struct stat st; + + omni_file_logger_init(logger, file); + if (path != NULL && path[0] != '\0') { + snprintf(logger->path, sizeof(logger->path), "%s", path); + if (stat(path, &st) == 0) { + logger->current_bytes = (size_t) st.st_size; + } + } + logger->flush_bytes = omni_positive_size_env("BLITZ_JSONL_FLUSH_BYTES", 262144U); + logger->flush_interval_ms = omni_positive_int_env("BLITZ_JSONL_FLUSH_INTERVAL_MS", 1000); + logger->max_bytes = omni_positive_size_env("BLITZ_JSONL_ROTATE_BYTES", 134217728U); + logger->max_files = omni_positive_int_env("BLITZ_JSONL_ROTATE_FILES", 8); + logger->immediate_flush = immediate_flush != 0; } void omni_file_logger_destroy(omni_file_logger_t *logger) { @@ -555,13 +763,32 @@ void omni_file_logger_destroy(omni_file_logger_t *logger) { int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line) { int rc = 0; + size_t line_len; + uint64_t now_ms; if (logger == NULL || logger->file == NULL || line == NULL) { errno = EINVAL; return -1; } + line_len = strlen(line) + 1U; + now_ms = omni_now_monotonic_ms64(); pthread_mutex_lock(&logger->mutex); - if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF || fflush(logger->file) != 0) { + if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF) { rc = -1; + } else { + logger->current_bytes += line_len; + logger->buffered_bytes += line_len; + if (logger->immediate_flush || + logger->buffered_bytes >= logger->flush_bytes || + (logger->flush_interval_ms > 0 && now_ms - logger->last_flush_monotonic_ms >= (uint64_t) logger->flush_interval_ms)) { + if (omni_file_logger_flush_locked(logger, now_ms) != 0) { + rc = -1; + } + } + if (rc == 0 && logger->max_bytes > 0U && logger->current_bytes >= logger->max_bytes) { + if (omni_file_logger_rotate_locked(logger) != 0) { + rc = -1; + } + } } pthread_mutex_unlock(&logger->mutex); return rc; diff --git a/src/tx_timestamp_debug.c b/src/tx_timestamp_debug.c index 77f1bef..3cdf19c 100644 --- a/src/tx_timestamp_debug.c +++ b/src/tx_timestamp_debug.c @@ -18,7 +18,7 @@ tx_timestamp_debug_logger_t *tx_timestamp_debug_open_jsonl(const char *path) { fclose(file); return NULL; } - omni_file_logger_init(&logger->file_logger, file); + omni_file_logger_init_path(&logger->file_logger, file, path, 0); logger->enabled = 1; return logger; } diff --git a/src/video_pipeline.c b/src/video_pipeline.c index dffcf52..cbbb276 100644 --- a/src/video_pipeline.c +++ b/src/video_pipeline.c @@ -213,6 +213,8 @@ void video_pipeline_config_init(video_pipeline_config_t *config) { config->hard_backpressure_hold_ms = VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT; config->server_idle_reconnect_ms = VIDEO_DEFAULT_SERVER_IDLE_RECONNECT_MS; config->frame_stall_reconnect_ms = VIDEO_DEFAULT_FRAME_STALL_RECONNECT_MS; + config->stats_logger = NULL; + config->stats_interval_ms = 1000; } void video_pipeline_config_load_env(video_pipeline_config_t *config) { @@ -235,6 +237,7 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) { config->hard_backpressure_hold_ms = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS", config->hard_backpressure_hold_ms); config->server_idle_reconnect_ms = env_int_or_default("OMNI_VIDEO_SERVER_IDLE_RECONNECT_MS", config->server_idle_reconnect_ms); config->frame_stall_reconnect_ms = env_int_or_default("OMNI_VIDEO_FRAME_STALL_RECONNECT_MS", config->frame_stall_reconnect_ms); + config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms); } int video_pipeline_stats_init(video_pipeline_stats_t *stats) { @@ -600,8 +603,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config &options, NULL, NULL, - NULL, - KCP_DEFAULT_STATS_INTERVAL_MS + config->stats_logger, + config->stats_interval_ms ); if (sender->client == NULL) { return -1; diff --git a/src/video_pipeline_gps.c b/src/video_pipeline_gps.c index 13933a9..e60d6ef 100644 --- a/src/video_pipeline_gps.c +++ b/src/video_pipeline_gps.c @@ -186,6 +186,8 @@ void video_pipeline_config_init(video_pipeline_config_t *config) { config->output_height = VIDEO_OUTPUT_HEIGHT_DEFAULT; config->max_frames = 0; config->enable_timing_logs = 0; + config->stats_logger = NULL; + config->stats_interval_ms = 1000; } void video_pipeline_config_load_env(video_pipeline_config_t *config) { @@ -203,6 +205,7 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) { config->max_frames = atoi(getenv("OMNI_VIDEO_MAX_FRAMES")); } config->enable_timing_logs = env_flag_or_default("OMNI_VIDEO_DEBUG_TIMING", config->enable_timing_logs); + config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms); } int video_pipeline_stats_init(video_pipeline_stats_t *stats) { @@ -564,8 +567,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config &options, NULL, NULL, - NULL, - KCP_DEFAULT_STATS_INTERVAL_MS + config->stats_logger, + config->stats_interval_ms ); if (sender->client == NULL) { return -1;