feat: 日志增强功能
This commit is contained in:
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
#include "cJSON.h"
|
#include "cJSON.h"
|
||||||
#include "control_protocol.h"
|
#include "control_protocol.h"
|
||||||
|
#include "latencylog.h"
|
||||||
#include "protocol.h"
|
#include "protocol.h"
|
||||||
#include "video_pipeline.h"
|
#include "video_pipeline.h"
|
||||||
|
|
||||||
@@ -26,6 +27,8 @@
|
|||||||
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
|
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
|
||||||
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
|
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
|
||||||
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
|
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
|
||||||
|
#define DEFAULT_KCP_STATS_INTERVAL_MS 1000
|
||||||
|
#define DEFAULT_CONTROL_LATENCY_SAMPLE_MOD 100
|
||||||
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
|
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
|
||||||
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
|
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
|
||||||
|
|
||||||
@@ -65,11 +68,15 @@ typedef struct daemon_state {
|
|||||||
int control_server_idle_reconnect_ms;
|
int control_server_idle_reconnect_ms;
|
||||||
const char *runtime_dir;
|
const char *runtime_dir;
|
||||||
int heartbeat_timeout_sec;
|
int heartbeat_timeout_sec;
|
||||||
|
int stats_interval_ms;
|
||||||
|
uint64_t control_latency_sample_mod;
|
||||||
char status_file_path[512];
|
char status_file_path[512];
|
||||||
char video_thread_fault_file[512];
|
char video_thread_fault_file[512];
|
||||||
char control_thread_fault_file[512];
|
char control_thread_fault_file[512];
|
||||||
atomic_long video_thread_heartbeat_epoch_sec;
|
atomic_long video_thread_heartbeat_epoch_sec;
|
||||||
atomic_long control_thread_heartbeat_epoch_sec;
|
atomic_long control_thread_heartbeat_epoch_sec;
|
||||||
|
kcp_session_stats_logger_t *stats_logger;
|
||||||
|
latency_logger_t *control_latency_logger;
|
||||||
unix_dgram_client_t unix_client;
|
unix_dgram_client_t unix_client;
|
||||||
control_bridge_stats_t control_stats;
|
control_bridge_stats_t control_stats;
|
||||||
} daemon_state_t;
|
} daemon_state_t;
|
||||||
@@ -127,6 +134,21 @@ static int env_int_or_default(const char *name, int fallback) {
|
|||||||
return parsed;
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint64_t env_u64_or_default(const char *name, uint64_t fallback) {
|
||||||
|
const char *value = getenv(name);
|
||||||
|
unsigned long long parsed = 0ULL;
|
||||||
|
char *endptr = NULL;
|
||||||
|
|
||||||
|
if (value == NULL || value[0] == '\0') {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
parsed = strtoull(value, &endptr, 10);
|
||||||
|
if (endptr == value || *endptr != '\0' || parsed == 0ULL) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
return (uint64_t) parsed;
|
||||||
|
}
|
||||||
|
|
||||||
static int64_t realtime_epoch_ms(void) {
|
static int64_t realtime_epoch_ms(void) {
|
||||||
struct timespec ts;
|
struct timespec ts;
|
||||||
|
|
||||||
@@ -145,6 +167,19 @@ static void update_thread_heartbeat(atomic_long *heartbeat) {
|
|||||||
atomic_store(heartbeat, realtime_epoch_sec());
|
atomic_store(heartbeat, realtime_epoch_sec());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int should_log_control_latency(const daemon_state_t *state, const message_t *msg) {
|
||||||
|
uint64_t sample_mod;
|
||||||
|
|
||||||
|
if (state == NULL || state->control_latency_logger == NULL || msg == NULL) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
sample_mod = state->control_latency_sample_mod;
|
||||||
|
if (sample_mod <= 1U) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return msg->id % sample_mod == 0U;
|
||||||
|
}
|
||||||
|
|
||||||
static void video_pipeline_heartbeat_progress(void *context) {
|
static void video_pipeline_heartbeat_progress(void *context) {
|
||||||
update_thread_heartbeat((atomic_long *) context);
|
update_thread_heartbeat((atomic_long *) context);
|
||||||
}
|
}
|
||||||
@@ -556,8 +591,8 @@ static void *control_thread_main(void *arg) {
|
|||||||
&options,
|
&options,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
state->stats_logger,
|
||||||
KCP_DEFAULT_STATS_INTERVAL_MS
|
state->stats_interval_ms
|
||||||
);
|
);
|
||||||
if (client == NULL) {
|
if (client == NULL) {
|
||||||
control_bridge_set_errno_error(&state->control_stats, "failed to connect control session");
|
control_bridge_set_errno_error(&state->control_stats, "failed to connect control session");
|
||||||
@@ -692,6 +727,11 @@ static void *control_thread_main(void *arg) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (should_log_control_latency(state, &msg)) {
|
||||||
|
latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_APP_RECV, &msg);
|
||||||
|
latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_BEGIN, &msg);
|
||||||
|
}
|
||||||
|
|
||||||
if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) {
|
if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) {
|
||||||
int send_errno = errno;
|
int send_errno = errno;
|
||||||
int recovered = 0;
|
int recovered = 0;
|
||||||
@@ -708,6 +748,9 @@ static void *control_thread_main(void *arg) {
|
|||||||
state->control_stats.server_idle_ms = client_state.server_idle_ms;
|
state->control_stats.server_idle_ms = client_state.server_idle_ms;
|
||||||
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
|
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
|
||||||
pthread_mutex_unlock(&state->control_stats.mutex);
|
pthread_mutex_unlock(&state->control_stats.mutex);
|
||||||
|
if (should_log_control_latency(state, &msg)) {
|
||||||
|
latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, &msg);
|
||||||
|
}
|
||||||
protocol_message_clear(&msg);
|
protocol_message_clear(&msg);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -728,6 +771,9 @@ static void *control_thread_main(void *arg) {
|
|||||||
state->control_stats.server_idle_ms = client_state.server_idle_ms;
|
state->control_stats.server_idle_ms = client_state.server_idle_ms;
|
||||||
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
|
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
|
||||||
pthread_mutex_unlock(&state->control_stats.mutex);
|
pthread_mutex_unlock(&state->control_stats.mutex);
|
||||||
|
if (should_log_control_latency(state, &msg)) {
|
||||||
|
latencylog_log_message_event(state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, &msg);
|
||||||
|
}
|
||||||
protocol_message_clear(&msg);
|
protocol_message_clear(&msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -799,8 +845,12 @@ int main(void) {
|
|||||||
"BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
|
"BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
|
||||||
DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
|
DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
|
||||||
);
|
);
|
||||||
|
state.stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", DEFAULT_KCP_STATS_INTERVAL_MS);
|
||||||
|
state.control_latency_sample_mod = env_u64_or_default("BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD", DEFAULT_CONTROL_LATENCY_SAMPLE_MOD);
|
||||||
state.video_config.progress_callback = video_pipeline_heartbeat_progress;
|
state.video_config.progress_callback = video_pipeline_heartbeat_progress;
|
||||||
state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
|
state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
|
||||||
|
state.video_config.stats_logger = NULL;
|
||||||
|
state.video_config.stats_interval_ms = state.stats_interval_ms;
|
||||||
state.control_server_idle_reconnect_ms = env_int_or_default(
|
state.control_server_idle_reconnect_ms = env_int_or_default(
|
||||||
"OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
|
"OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
|
||||||
CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
|
CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
|
||||||
@@ -860,11 +910,34 @@ int main(void) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const char *stats_log_path = getenv("BLITZ_KCP_STATS_LOG_PATH");
|
||||||
|
const char *latency_log_path = getenv("BLITZ_CONTROL_LATENCY_LOG_PATH");
|
||||||
|
int latency_enabled = env_int_or_default("BLITZ_CONTROL_LATENCY_LOG_ENABLED", 1);
|
||||||
|
|
||||||
|
if (stats_log_path != NULL && stats_log_path[0] != '\0') {
|
||||||
|
state.stats_logger = kcp_session_stats_open_jsonl(stats_log_path);
|
||||||
|
if (state.stats_logger == NULL) {
|
||||||
|
fprintf(stderr, "[b_side_omnid] warning: failed to open KCP stats log %s\n", stats_log_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (latency_enabled && latency_log_path != NULL && latency_log_path[0] != '\0') {
|
||||||
|
state.control_latency_logger = latencylog_open_jsonl(latency_log_path);
|
||||||
|
if (state.control_latency_logger == NULL) {
|
||||||
|
fprintf(stderr, "[b_side_omnid] warning: failed to open control latency log %s\n", latency_log_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state.video_config.stats_logger = state.stats_logger;
|
||||||
|
state.video_config.stats_interval_ms = state.stats_interval_ms;
|
||||||
|
}
|
||||||
|
|
||||||
if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) {
|
if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) {
|
||||||
perror("pthread_create(video_thread)");
|
perror("pthread_create(video_thread)");
|
||||||
unix_dgram_client_close(&state.unix_client);
|
unix_dgram_client_close(&state.unix_client);
|
||||||
control_bridge_stats_destroy(&state.control_stats);
|
control_bridge_stats_destroy(&state.control_stats);
|
||||||
video_pipeline_stats_destroy(&state.video_stats);
|
video_pipeline_stats_destroy(&state.video_stats);
|
||||||
|
latencylog_close(state.control_latency_logger);
|
||||||
|
kcp_session_stats_close(state.stats_logger);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) {
|
if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) {
|
||||||
@@ -874,6 +947,8 @@ int main(void) {
|
|||||||
unix_dgram_client_close(&state.unix_client);
|
unix_dgram_client_close(&state.unix_client);
|
||||||
control_bridge_stats_destroy(&state.control_stats);
|
control_bridge_stats_destroy(&state.control_stats);
|
||||||
video_pipeline_stats_destroy(&state.video_stats);
|
video_pipeline_stats_destroy(&state.video_stats);
|
||||||
|
latencylog_close(state.control_latency_logger);
|
||||||
|
kcp_session_stats_close(state.stats_logger);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -891,5 +966,7 @@ int main(void) {
|
|||||||
unix_dgram_client_close(&state.unix_client);
|
unix_dgram_client_close(&state.unix_client);
|
||||||
control_bridge_stats_destroy(&state.control_stats);
|
control_bridge_stats_destroy(&state.control_stats);
|
||||||
video_pipeline_stats_destroy(&state.video_stats);
|
video_pipeline_stats_destroy(&state.video_stats);
|
||||||
|
latencylog_close(state.control_latency_logger);
|
||||||
|
kcp_session_stats_close(state.stats_logger);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
|
#include <limits.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
@@ -31,6 +32,15 @@
|
|||||||
typedef struct omni_file_logger {
|
typedef struct omni_file_logger {
|
||||||
FILE *file;
|
FILE *file;
|
||||||
pthread_mutex_t mutex;
|
pthread_mutex_t mutex;
|
||||||
|
char path[PATH_MAX];
|
||||||
|
size_t current_bytes;
|
||||||
|
size_t buffered_bytes;
|
||||||
|
size_t flush_bytes;
|
||||||
|
size_t max_bytes;
|
||||||
|
int flush_interval_ms;
|
||||||
|
int max_files;
|
||||||
|
int immediate_flush;
|
||||||
|
uint64_t last_flush_monotonic_ms;
|
||||||
} omni_file_logger_t;
|
} omni_file_logger_t;
|
||||||
|
|
||||||
int64_t omni_now_unix_nano(void);
|
int64_t omni_now_unix_nano(void);
|
||||||
@@ -61,6 +71,7 @@ double omni_duration_ms_to_ns(double ms);
|
|||||||
const char *omni_path_base_name(const char *path);
|
const char *omni_path_base_name(const char *path);
|
||||||
|
|
||||||
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file);
|
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file);
|
||||||
|
void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush);
|
||||||
void omni_file_logger_destroy(omni_file_logger_t *logger);
|
void omni_file_logger_destroy(omni_file_logger_t *logger);
|
||||||
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line);
|
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line);
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,8 @@ typedef struct video_pipeline_config {
|
|||||||
int hard_backpressure_hold_ms;
|
int hard_backpressure_hold_ms;
|
||||||
int server_idle_reconnect_ms;
|
int server_idle_reconnect_ms;
|
||||||
int frame_stall_reconnect_ms;
|
int frame_stall_reconnect_ms;
|
||||||
|
kcp_session_stats_logger_t *stats_logger;
|
||||||
|
int stats_interval_ms;
|
||||||
video_pipeline_progress_fn progress_callback;
|
video_pipeline_progress_fn progress_callback;
|
||||||
void *progress_context;
|
void *progress_context;
|
||||||
} video_pipeline_config_t;
|
} video_pipeline_config_t;
|
||||||
|
|||||||
137
scripts/boot/blitz-5g-link-logger.sh
Normal file
137
scripts/boot/blitz-5g-link-logger.sh
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/common.sh"
|
||||||
|
|
||||||
|
STEP="5g-link-logger"
|
||||||
|
|
||||||
|
resolve_target_ip() {
|
||||||
|
if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then
|
||||||
|
printf '%s\n' "${BLITZ_TIME_SERVER_IP}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
for candidate in ${BLITZ_5G_ROUTE_TARGETS//,/ }; do
|
||||||
|
if [[ -n "${candidate}" ]]; then
|
||||||
|
printf '%s\n' "${candidate}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
emit_sample_json() {
|
||||||
|
local interface_name="${1:-}"
|
||||||
|
local target_ip="${2:-}"
|
||||||
|
|
||||||
|
python3 - "${interface_name}" "${target_ip}" <<'PY'
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
interface_name = sys.argv[1]
|
||||||
|
target_ip = sys.argv[2]
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"ts_unix_ms": time.time_ns() // 1_000_000,
|
||||||
|
"interface": interface_name,
|
||||||
|
"target_ip": target_ip,
|
||||||
|
"link_present": False,
|
||||||
|
"route_output": "",
|
||||||
|
"route_ok": False,
|
||||||
|
"probe_ok": False,
|
||||||
|
"ping_rtt_ms": None,
|
||||||
|
"rx_bytes": 0,
|
||||||
|
"tx_bytes": 0,
|
||||||
|
"rx_packets": 0,
|
||||||
|
"tx_packets": 0,
|
||||||
|
"rx_errors": 0,
|
||||||
|
"tx_errors": 0,
|
||||||
|
"rx_drops": 0,
|
||||||
|
"tx_drops": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
if interface_name:
|
||||||
|
try:
|
||||||
|
output = subprocess.check_output(
|
||||||
|
["ip", "-j", "-s", "link", "show", "dev", interface_name],
|
||||||
|
text=True,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
)
|
||||||
|
stats = json.loads(output)
|
||||||
|
if stats:
|
||||||
|
item = stats[0]
|
||||||
|
payload["link_present"] = True
|
||||||
|
rx = item.get("stats64", {}).get("rx", {})
|
||||||
|
tx = item.get("stats64", {}).get("tx", {})
|
||||||
|
if not rx and not tx:
|
||||||
|
rx = item.get("stats", {}).get("rx", {})
|
||||||
|
tx = item.get("stats", {}).get("tx", {})
|
||||||
|
payload["rx_bytes"] = int(rx.get("bytes") or 0)
|
||||||
|
payload["tx_bytes"] = int(tx.get("bytes") or 0)
|
||||||
|
payload["rx_packets"] = int(rx.get("packets") or 0)
|
||||||
|
payload["tx_packets"] = int(tx.get("packets") or 0)
|
||||||
|
payload["rx_errors"] = int(rx.get("errors") or 0)
|
||||||
|
payload["tx_errors"] = int(tx.get("errors") or 0)
|
||||||
|
payload["rx_drops"] = int(rx.get("dropped") or 0)
|
||||||
|
payload["tx_drops"] = int(tx.get("dropped") or 0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if target_ip:
|
||||||
|
try:
|
||||||
|
route = subprocess.check_output(
|
||||||
|
["ip", "route", "get", target_ip],
|
||||||
|
text=True,
|
||||||
|
stderr=subprocess.STDOUT,
|
||||||
|
).strip()
|
||||||
|
payload["route_output"] = route.splitlines()[0] if route else ""
|
||||||
|
payload["route_ok"] = bool(payload["route_output"]) and (
|
||||||
|
not interface_name or f" dev {interface_name}" in payload["route_output"]
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
payload["route_output"] = str(exc)
|
||||||
|
|
||||||
|
ping_cmd = ["ping", "-c", "1", "-W", "2", target_ip]
|
||||||
|
if interface_name:
|
||||||
|
ping_cmd[1:1] = ["-I", interface_name]
|
||||||
|
ping = subprocess.run(ping_cmd, capture_output=True, text=True)
|
||||||
|
payload["probe_ok"] = ping.returncode == 0
|
||||||
|
output = (ping.stdout or "") + "\n" + (ping.stderr or "")
|
||||||
|
for token in output.replace("\n", " ").split():
|
||||||
|
if token.startswith("time="):
|
||||||
|
value = token.split("=", 1)[1].rstrip("ms")
|
||||||
|
try:
|
||||||
|
payload["ping_rtt_ms"] = float(value)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
|
||||||
|
print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False))
|
||||||
|
PY
|
||||||
|
}
|
||||||
|
|
||||||
|
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_require_run_context
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -z "${BLITZ_RUN_DIR:-}" && -f "${BLITZ_RUN_CONTEXT_FILE:-}" ]]; then
|
||||||
|
blitz_load_run_context_env || true
|
||||||
|
fi
|
||||||
|
blitz_ensure_instance_id
|
||||||
|
|
||||||
|
export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}"
|
||||||
|
target_ip="$(resolve_target_ip || true)"
|
||||||
|
|
||||||
|
blitz_log "${STEP}" "start" "start" "path=${BLITZ_5G_LINK_LOG_PATH} interval_sec=${BLITZ_5G_LINK_LOG_INTERVAL_SEC}" 0
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
interface_name="$(blitz_resolve_5g_interface || true)"
|
||||||
|
line="$(emit_sample_json "${interface_name}" "${target_ip}")"
|
||||||
|
blitz_jsonl_append_line "${BLITZ_5G_LINK_LOG_PATH}" "${line}"
|
||||||
|
sleep "${BLITZ_5G_LINK_LOG_INTERVAL_SEC}"
|
||||||
|
done
|
||||||
50
scripts/boot/blitz-incident-capture-launch.sh
Normal file
50
scripts/boot/blitz-incident-capture-launch.sh
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/common.sh"
|
||||||
|
|
||||||
|
STEP="incident-launch"
|
||||||
|
incident_id=""
|
||||||
|
args=()
|
||||||
|
timeout_bin=""
|
||||||
|
|
||||||
|
while (($# > 0)); do
|
||||||
|
case "$1" in
|
||||||
|
--incident-id)
|
||||||
|
incident_id="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
args+=("$1")
|
||||||
|
shift
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_require_root "${STEP}"
|
||||||
|
blitz_require_command systemd-run "${STEP}"
|
||||||
|
blitz_require_command timeout "${STEP}"
|
||||||
|
timeout_bin="$(command -v timeout)"
|
||||||
|
|
||||||
|
if [[ -z "${incident_id}" ]]; then
|
||||||
|
incident_id="$(blitz_new_incident_id)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
unit_name="blitz-incident-${incident_id//[^A-Za-z0-9_.-]/-}"
|
||||||
|
|
||||||
|
systemd-run \
|
||||||
|
--quiet \
|
||||||
|
--collect \
|
||||||
|
--unit "${unit_name}" \
|
||||||
|
--property=Type=oneshot \
|
||||||
|
--property="StandardOutput=append:${BLITZ_LOG_FILE}" \
|
||||||
|
--property="StandardError=append:${BLITZ_LOG_FILE}" \
|
||||||
|
"${timeout_bin}" "${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC}s" \
|
||||||
|
/bin/bash "${SCRIPT_DIR}/blitz-incident-capture.sh" \
|
||||||
|
--incident-id "${incident_id}" \
|
||||||
|
"${args[@]}"
|
||||||
|
|
||||||
|
printf '%s\n' "${incident_id}"
|
||||||
131
scripts/boot/blitz-incident-capture.sh
Normal file
131
scripts/boot/blitz-incident-capture.sh
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/common.sh"
|
||||||
|
|
||||||
|
STEP="incident-capture"
|
||||||
|
incident_id=""
|
||||||
|
incident_source=""
|
||||||
|
incident_reason=""
|
||||||
|
incident_unit=""
|
||||||
|
incident_result=""
|
||||||
|
incident_exit_status=""
|
||||||
|
|
||||||
|
run_capture() {
|
||||||
|
local output_path="$1"
|
||||||
|
shift
|
||||||
|
|
||||||
|
if command -v timeout >/dev/null 2>&1; then
|
||||||
|
timeout "${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC}s" "$@" > "${output_path}" 2>&1 || true
|
||||||
|
else
|
||||||
|
"$@" > "${output_path}" 2>&1 || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
while (($# > 0)); do
|
||||||
|
case "$1" in
|
||||||
|
--incident-id)
|
||||||
|
incident_id="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--source)
|
||||||
|
incident_source="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--reason)
|
||||||
|
incident_reason="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--unit)
|
||||||
|
incident_unit="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--result)
|
||||||
|
incident_result="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--exit-status)
|
||||||
|
incident_exit_status="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
blitz_log "${STEP}" "parse-arg" "failure" "unknown argument: $1" 2
|
||||||
|
exit 2
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -n "${incident_result}" && "${incident_result}" == "success" ]]; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_load_run_context_env || true
|
||||||
|
blitz_prepare_runtime_dir
|
||||||
|
blitz_prepare_run_root
|
||||||
|
|
||||||
|
if [[ -z "${incident_id}" ]]; then
|
||||||
|
incident_id="$(blitz_new_incident_id)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
incident_dir="${BLITZ_RUN_ROOT}/incidents/${incident_id}"
|
||||||
|
mkdir -p "${incident_dir}"
|
||||||
|
|
||||||
|
python3 - "${incident_dir}/incident.json" "${incident_id}" "${BLITZ_RUN_ID:-}" "${incident_source}" "${incident_reason}" "${incident_unit}" "${incident_result}" "${incident_exit_status}" "${BLITZ_RUN_DIR:-}" "${HOSTNAME:-$(hostname)}" <<'PY'
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
path, incident_id, run_id, source, reason, unit, result, exit_status, run_dir, hostname = sys.argv[1:10]
|
||||||
|
payload = {
|
||||||
|
"incident_id": incident_id,
|
||||||
|
"run_id": run_id,
|
||||||
|
"source": source,
|
||||||
|
"fault_reason": reason,
|
||||||
|
"unit": unit,
|
||||||
|
"service_result": result,
|
||||||
|
"exit_status": exit_status,
|
||||||
|
"run_dir": run_dir,
|
||||||
|
"hostname": hostname,
|
||||||
|
"captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
|
||||||
|
}
|
||||||
|
with open(path, "w", encoding="utf-8") as handle:
|
||||||
|
json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
|
||||||
|
PY
|
||||||
|
|
||||||
|
for status_file in \
|
||||||
|
"${BLITZ_RUNTIME_DIR}/watchdog.status.json" \
|
||||||
|
"${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" \
|
||||||
|
"${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
|
||||||
|
do
|
||||||
|
if [[ -f "${status_file}" ]]; then
|
||||||
|
cp -f "${status_file}" "${incident_dir}/$(basename "${status_file}")"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -f "${BLITZ_LOG_FILE}" ]]; then
|
||||||
|
tail -n 400 "${BLITZ_LOG_FILE}" > "${incident_dir}/startup.log.tail"
|
||||||
|
fi
|
||||||
|
|
||||||
|
run_capture "${incident_dir}/systemctl-status.txt" \
|
||||||
|
systemctl status blitz-robot.target blitz-run-context.service blitz-5g-dial.service blitz-5g-link-logger.service blitz-ros-receiver.service blitz-b-side-omnid.service blitz-watchdog.service
|
||||||
|
run_capture "${incident_dir}/journal.txt" \
|
||||||
|
journalctl --no-pager --since "5 minutes ago" -u blitz-run-context.service -u blitz-5g-dial.service -u blitz-5g-link-logger.service -u blitz-ros-receiver.service -u blitz-b-side-omnid.service -u blitz-watchdog.service
|
||||||
|
run_capture "${incident_dir}/ip-addr.txt" ip addr
|
||||||
|
run_capture "${incident_dir}/ip-route.txt" ip route
|
||||||
|
run_capture "${incident_dir}/ss-uapn.txt" ss -uapn
|
||||||
|
run_capture "${incident_dir}/ss-xlp.txt" ss -xlp
|
||||||
|
|
||||||
|
if [[ -f "${BLITZ_5G_INFO_JSON:-}" ]]; then
|
||||||
|
cp -f "${BLITZ_5G_INFO_JSON}" "${incident_dir}/$(basename "${BLITZ_5G_INFO_JSON}")"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "${BLITZ_RUN_DIR:-}" && -d "${BLITZ_RUN_DIR}" ]]; then
|
||||||
|
while IFS= read -r -d '' jsonl; do
|
||||||
|
tail -n 200 "${jsonl}" > "${incident_dir}/tail-$(basename "${jsonl}")"
|
||||||
|
done < <(find "${BLITZ_RUN_DIR}" -maxdepth 1 -type f -name '*.jsonl' -print0 2>/dev/null)
|
||||||
|
fi
|
||||||
|
|
||||||
|
blitz_log "${STEP}" "complete" "success" "incident_id=${incident_id} path=${incident_dir}" 0
|
||||||
14
scripts/boot/blitz-run-context.sh
Normal file
14
scripts/boot/blitz-run-context.sh
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/common.sh"
|
||||||
|
|
||||||
|
STEP="run-context"
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_require_root "${STEP}"
|
||||||
|
blitz_require_command python3 "${STEP}"
|
||||||
|
blitz_init_run_context
|
||||||
|
blitz_log "${STEP}" "complete" "success" "run_id=${BLITZ_RUN_ID} run_dir=${BLITZ_RUN_DIR}" 0
|
||||||
@@ -12,6 +12,10 @@ B_SIDE_STATUS_FILE=""
|
|||||||
ROS_STATUS_FILE=""
|
ROS_STATUS_FILE=""
|
||||||
WATCHDOG_STATUS_FILE=""
|
WATCHDOG_STATUS_FILE=""
|
||||||
NETWORK_FAULT_FILE=""
|
NETWORK_FAULT_FILE=""
|
||||||
|
WATCHDOG_EVENT_LOG=""
|
||||||
|
WATCHDOG_SAMPLE_LOG=""
|
||||||
|
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
|
||||||
|
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
|
||||||
CAMERA_MISSING_PREV=0
|
CAMERA_MISSING_PREV=0
|
||||||
CAMERA_RECOVERY_STABLE_COUNT=0
|
CAMERA_RECOVERY_STABLE_COUNT=0
|
||||||
NETWORK_FAIL_COUNT=0
|
NETWORK_FAIL_COUNT=0
|
||||||
@@ -28,6 +32,8 @@ GPS_LAST_CHECK_SEC=0
|
|||||||
GPS_DEVICE_PRESENT_PREV=-1
|
GPS_DEVICE_PRESENT_PREV=-1
|
||||||
GPS_DEVICE_PRESENT_STATE=1
|
GPS_DEVICE_PRESENT_STATE=1
|
||||||
GPS_STACK_ACTIVE_STATE=1
|
GPS_STACK_ACTIVE_STATE=1
|
||||||
|
LAST_REPORTED_FAULT_REASON=""
|
||||||
|
LAST_REPORTED_RECOVERY_STATE=""
|
||||||
declare -A TARGETED_RESTART_WINDOW_START=()
|
declare -A TARGETED_RESTART_WINDOW_START=()
|
||||||
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
declare -A TARGETED_RESTART_WINDOW_COUNT=()
|
||||||
|
|
||||||
@@ -236,6 +242,116 @@ EOF
|
|||||||
mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
|
mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
watchdog_emit_json() {
|
||||||
|
local record_type="$1"
|
||||||
|
local action="$2"
|
||||||
|
local fault_reason="$3"
|
||||||
|
local recovery_state="$4"
|
||||||
|
local detail="$5"
|
||||||
|
local incident_id="${6:-}"
|
||||||
|
local network_ok="${7:-1}"
|
||||||
|
local camera_ok="${8:-1}"
|
||||||
|
local ros_ok="${9:-1}"
|
||||||
|
local bside_ok="${10:-1}"
|
||||||
|
local gps_ok="${11:-1}"
|
||||||
|
local gps_device_present="${12:-1}"
|
||||||
|
|
||||||
|
python3 - "${record_type}" "${action}" "${fault_reason}" "${recovery_state}" "${detail}" "${incident_id}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}" "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}" "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}" <<'PY'
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
record_type, action, fault_reason, recovery_state, detail, incident_id, network_ok, camera_ok, ros_ok, bside_ok, gps_ok, gps_device_present, last_action, last_action_epoch_ms, network_fail_count, targeted_restart_count, full_restart_count = sys.argv[1:18]
|
||||||
|
payload = {
|
||||||
|
"ts_unix_ms": time.time_ns() // 1_000_000,
|
||||||
|
"record_type": record_type,
|
||||||
|
"action": action,
|
||||||
|
"fault_reason": fault_reason,
|
||||||
|
"recovery_state": recovery_state,
|
||||||
|
"detail": detail,
|
||||||
|
"incident_id": incident_id or None,
|
||||||
|
"network_ok": network_ok == "1",
|
||||||
|
"camera_ok": camera_ok == "1",
|
||||||
|
"ros_ok": ros_ok == "1",
|
||||||
|
"bside_ok": bside_ok == "1",
|
||||||
|
"gps_ok": gps_ok == "1",
|
||||||
|
"gps_device_present": gps_device_present == "1",
|
||||||
|
"network_fail_count": int(network_fail_count),
|
||||||
|
"targeted_restart_count": int(targeted_restart_count),
|
||||||
|
"full_restart_count": int(full_restart_count),
|
||||||
|
"last_action": last_action,
|
||||||
|
"last_action_epoch_ms": int(last_action_epoch_ms or 0),
|
||||||
|
}
|
||||||
|
print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False))
|
||||||
|
PY
|
||||||
|
}
|
||||||
|
|
||||||
|
watchdog_append_event() {
|
||||||
|
local line=""
|
||||||
|
|
||||||
|
[[ -n "${WATCHDOG_EVENT_LOG}" ]] || return 0
|
||||||
|
if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
|
||||||
|
if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
|
||||||
|
blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${line}" 0 || true
|
||||||
|
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
if ! blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${line}"; then
|
||||||
|
if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
|
||||||
|
blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=append-failed" 0 || true
|
||||||
|
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
|
||||||
|
}
|
||||||
|
|
||||||
|
watchdog_append_sample() {
|
||||||
|
local line=""
|
||||||
|
|
||||||
|
[[ -n "${WATCHDOG_SAMPLE_LOG}" ]] || return 0
|
||||||
|
if ! line="$(watchdog_emit_json "$@" 2>&1)"; then
|
||||||
|
if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
|
||||||
|
blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${line}" 0 || true
|
||||||
|
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
if ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${line}"; then
|
||||||
|
if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
|
||||||
|
blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=append-failed" 0 || true
|
||||||
|
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
|
||||||
|
}
|
||||||
|
|
||||||
|
watchdog_record_state_transition() {
|
||||||
|
local fault_reason="$1"
|
||||||
|
local recovery_state="$2"
|
||||||
|
|
||||||
|
if [[ "${fault_reason}" == "${LAST_REPORTED_FAULT_REASON}" && "${recovery_state}" == "${LAST_REPORTED_RECOVERY_STATE}" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
watchdog_append_event "event" "state-transition" "${fault_reason}" "${recovery_state}" "" ""
|
||||||
|
LAST_REPORTED_FAULT_REASON="${fault_reason}"
|
||||||
|
LAST_REPORTED_RECOVERY_STATE="${recovery_state}"
|
||||||
|
}
|
||||||
|
|
||||||
|
watchdog_launch_incident() {
|
||||||
|
local reason="$1"
|
||||||
|
local unit_name="$2"
|
||||||
|
|
||||||
|
"${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh" \
|
||||||
|
--source watchdog \
|
||||||
|
--reason "${reason}" \
|
||||||
|
--unit "${unit_name}" \
|
||||||
|
--result failure \
|
||||||
|
--exit-status 1 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
set_last_action() {
|
set_last_action() {
|
||||||
LAST_ACTION="$1"
|
LAST_ACTION="$1"
|
||||||
LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
|
LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
|
||||||
@@ -283,6 +399,7 @@ record_full_restart() {
|
|||||||
fi
|
fi
|
||||||
if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
|
if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
|
||||||
BACKOFF_UNTIL=$(( now_sec + 60 ))
|
BACKOFF_UNTIL=$(( now_sec + 60 ))
|
||||||
|
watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" ""
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -290,36 +407,45 @@ restart_bside_targeted() {
|
|||||||
local fault_key="$1"
|
local fault_key="$1"
|
||||||
local reason="$2"
|
local reason="$2"
|
||||||
local rc
|
local rc
|
||||||
|
local incident_id=""
|
||||||
|
|
||||||
if register_targeted_restart "${fault_key}"; then
|
if register_targeted_restart "${fault_key}"; then
|
||||||
blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
|
blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
|
||||||
|
watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" ""
|
||||||
full_restart_stack "${reason}-escalated"
|
full_restart_stack "${reason}-escalated"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")"
|
||||||
set_last_action "restart-bside"
|
set_last_action "restart-bside"
|
||||||
RECOVERY_ACTION_TAKEN=1
|
RECOVERY_ACTION_TAKEN=1
|
||||||
blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
|
blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
|
||||||
|
watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
|
||||||
if systemctl restart "${B_SIDE_SERVICE}"; then
|
if systemctl restart "${B_SIDE_SERVICE}"; then
|
||||||
blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
|
blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
|
||||||
|
watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rc=$?
|
rc=$?
|
||||||
blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
|
blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
|
||||||
|
watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}"
|
||||||
return "${rc}"
|
return "${rc}"
|
||||||
}
|
}
|
||||||
|
|
||||||
full_restart_stack() {
|
full_restart_stack() {
|
||||||
local reason="$1"
|
local reason="$1"
|
||||||
local rc
|
local rc
|
||||||
|
local incident_id=""
|
||||||
|
|
||||||
|
incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")"
|
||||||
set_last_action "full-restart"
|
set_last_action "full-restart"
|
||||||
RECOVERY_ACTION_TAKEN=1
|
RECOVERY_ACTION_TAKEN=1
|
||||||
recovery_state="recovering"
|
recovery_state="recovering"
|
||||||
fault_reason="${reason}"
|
fault_reason="${reason}"
|
||||||
|
|
||||||
blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
|
blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
|
||||||
|
watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}"
|
||||||
systemctl stop "${B_SIDE_SERVICE}" || true
|
systemctl stop "${B_SIDE_SERVICE}" || true
|
||||||
|
|
||||||
if systemctl restart "${ROS_SERVICE}"; then
|
if systemctl restart "${ROS_SERVICE}"; then
|
||||||
@@ -345,9 +471,11 @@ full_restart_stack() {
|
|||||||
else
|
else
|
||||||
rc=$?
|
rc=$?
|
||||||
blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
|
blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
|
||||||
|
watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}"
|
||||||
record_full_restart
|
record_full_restart
|
||||||
return "${rc}"
|
return "${rc}"
|
||||||
fi
|
fi
|
||||||
|
watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}"
|
||||||
record_full_restart
|
record_full_restart
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -659,6 +787,7 @@ wait_for_network_recovery() {
|
|||||||
|
|
||||||
perform_network_recovery() {
|
perform_network_recovery() {
|
||||||
local rc=0
|
local rc=0
|
||||||
|
local incident_id=""
|
||||||
|
|
||||||
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
|
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
|
||||||
set_last_action "route-repair"
|
set_last_action "route-repair"
|
||||||
@@ -666,12 +795,15 @@ perform_network_recovery() {
|
|||||||
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
||||||
NETWORK_FAIL_COUNT=0
|
NETWORK_FAIL_COUNT=0
|
||||||
blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
|
blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
|
||||||
|
watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" ""
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")"
|
||||||
set_last_action "network-recovery"
|
set_last_action "network-recovery"
|
||||||
RECOVERY_ACTION_TAKEN=1
|
RECOVERY_ACTION_TAKEN=1
|
||||||
blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
|
blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
|
||||||
|
watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}"
|
||||||
systemctl stop "${B_SIDE_SERVICE}" || true
|
systemctl stop "${B_SIDE_SERVICE}" || true
|
||||||
|
|
||||||
if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then
|
if bash "${BOOT_SCRIPT_DIR}/5g-dial.sh"; then
|
||||||
@@ -679,6 +811,7 @@ perform_network_recovery() {
|
|||||||
else
|
else
|
||||||
rc=$?
|
rc=$?
|
||||||
blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
|
blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
|
||||||
|
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}"
|
||||||
return "${rc}"
|
return "${rc}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -687,11 +820,13 @@ perform_network_recovery() {
|
|||||||
else
|
else
|
||||||
rc=$?
|
rc=$?
|
||||||
blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
|
blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
|
||||||
|
watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}"
|
||||||
return "${rc}"
|
return "${rc}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
|
||||||
NETWORK_FAIL_COUNT=0
|
NETWORK_FAIL_COUNT=0
|
||||||
|
watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}"
|
||||||
if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
|
if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
|
||||||
restart_bside_targeted "network" "network-recovered"
|
restart_bside_targeted "network" "network-recovered"
|
||||||
return 0
|
return 0
|
||||||
@@ -707,11 +842,14 @@ blitz_require_command stat "${STEP}"
|
|||||||
blitz_require_command ping "${STEP}"
|
blitz_require_command ping "${STEP}"
|
||||||
blitz_require_command python3 "${STEP}"
|
blitz_require_command python3 "${STEP}"
|
||||||
blitz_prepare_runtime_dir
|
blitz_prepare_runtime_dir
|
||||||
|
blitz_require_run_context
|
||||||
|
|
||||||
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
|
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
|
||||||
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
|
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
|
||||||
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
|
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
|
||||||
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
|
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
|
||||||
|
WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl"
|
||||||
|
WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl"
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
fault_reason="none"
|
fault_reason="none"
|
||||||
@@ -735,7 +873,9 @@ while true; do
|
|||||||
if (( BACKOFF_UNTIL > now_sec )); then
|
if (( BACKOFF_UNTIL > now_sec )); then
|
||||||
fault_reason="backoff"
|
fault_reason="backoff"
|
||||||
recovery_state="backoff"
|
recovery_state="backoff"
|
||||||
|
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
|
||||||
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
||||||
|
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
|
||||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
@@ -824,6 +964,8 @@ while true; do
|
|||||||
full_restart_stack "ros-unhealthy" || true
|
full_restart_stack "ros-unhealthy" || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
|
||||||
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
||||||
|
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
|
||||||
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -33,8 +33,10 @@ blitz_load_boot_env() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
export BLITZ_BOOT_LOADING_ENV="1"
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source "${DEV_SCRIPT_DIR}/load-env.sh"
|
source "${DEV_SCRIPT_DIR}/load-env.sh"
|
||||||
|
unset BLITZ_BOOT_LOADING_ENV
|
||||||
|
|
||||||
for env_file in \
|
for env_file in \
|
||||||
"${BOOT_SCRIPT_DIR}/robot-boot.env" \
|
"${BOOT_SCRIPT_DIR}/robot-boot.env" \
|
||||||
@@ -51,8 +53,12 @@ blitz_load_boot_env() {
|
|||||||
default_time_server="$(blitz_host_from_addr "${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}" || true)"
|
default_time_server="$(blitz_host_from_addr "${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}" || true)"
|
||||||
|
|
||||||
export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}"
|
export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}"
|
||||||
|
export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-/var/log/blitz-robot}"
|
||||||
export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}"
|
export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}"
|
||||||
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}"
|
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}"
|
||||||
|
export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}"
|
||||||
|
export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}"
|
||||||
|
export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}"
|
||||||
export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}"
|
export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}"
|
||||||
export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}"
|
export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}"
|
||||||
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
|
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
|
||||||
@@ -71,6 +77,16 @@ blitz_load_boot_env() {
|
|||||||
export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}"
|
export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}"
|
||||||
export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}"
|
export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}"
|
||||||
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
|
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
|
||||||
|
export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}"
|
||||||
|
export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}"
|
||||||
|
export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}"
|
||||||
|
export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}"
|
||||||
|
export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}"
|
||||||
|
export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}"
|
||||||
|
export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}"
|
||||||
|
export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}"
|
||||||
|
export BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC:-5}"
|
||||||
|
export BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC:-30}"
|
||||||
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
|
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
|
||||||
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
|
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
|
||||||
export BLITZ_GPS_MONITOR_ENABLED="${BLITZ_GPS_MONITOR_ENABLED:-1}"
|
export BLITZ_GPS_MONITOR_ENABLED="${BLITZ_GPS_MONITOR_ENABLED:-1}"
|
||||||
@@ -409,3 +425,212 @@ blitz_prepare_runtime_dir() {
|
|||||||
fi
|
fi
|
||||||
blitz_log "runtime-dir" "prepare" "success" "path=${runtime_dir}" 0
|
blitz_log "runtime-dir" "prepare" "success" "path=${runtime_dir}" 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
blitz_prepare_run_root() {
|
||||||
|
local run_root
|
||||||
|
local run_dir
|
||||||
|
local incidents_dir
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
run_root="${BLITZ_RUN_ROOT}"
|
||||||
|
run_dir="${run_root}/runs"
|
||||||
|
incidents_dir="${run_root}/incidents"
|
||||||
|
|
||||||
|
mkdir -p "${run_dir}" "${incidents_dir}"
|
||||||
|
if [[ "${EUID}" -eq 0 ]]; then
|
||||||
|
chown -R "root:${BLITZ_ROS_USER}" "${run_root}" 2>/dev/null || true
|
||||||
|
chmod 0775 "${run_root}" "${run_dir}" "${incidents_dir}" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_load_run_context_env() {
|
||||||
|
local context_file="${1:-${BLITZ_RUN_CONTEXT_FILE:-}}"
|
||||||
|
|
||||||
|
if [[ -z "${context_file}" || ! -f "${context_file}" ]]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
set -a
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
source "${context_file}"
|
||||||
|
set +a
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_read_run_id() {
|
||||||
|
local run_id_file="${BLITZ_RUN_ID_FILE:-}"
|
||||||
|
|
||||||
|
if [[ -z "${run_id_file}" || ! -f "${run_id_file}" ]]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
tr -d '\r\n' < "${run_id_file}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_utc_compact_timestamp() {
|
||||||
|
date -u '+%Y%m%dT%H%M%SZ'
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_new_run_id() {
|
||||||
|
printf '%s\n' "$(blitz_utc_compact_timestamp)"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_new_incident_id() {
|
||||||
|
local prefix="${1:-incident}"
|
||||||
|
printf '%s-%s-%d\n' "${prefix}" "$(blitz_utc_compact_timestamp)" "$$"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_new_instance_id() {
|
||||||
|
printf '%s-%d\n' "$(blitz_utc_compact_timestamp)" "$$"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_git_commit() {
|
||||||
|
git -C "${OMNISOCKETGO_ROOT}" rev-parse HEAD 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_git_dirty_flag() {
|
||||||
|
if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty >/dev/null 2>&1; then
|
||||||
|
printf '0\n'
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
printf '1\n'
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_write_run_context() {
|
||||||
|
local run_id="$1"
|
||||||
|
local run_dir="$2"
|
||||||
|
local boot_id="$3"
|
||||||
|
local context_file="${BLITZ_RUN_CONTEXT_FILE}"
|
||||||
|
local id_file="${BLITZ_RUN_ID_FILE}"
|
||||||
|
local temp_context
|
||||||
|
local temp_info
|
||||||
|
local commit_hash
|
||||||
|
local dirty_flag
|
||||||
|
local started_at
|
||||||
|
|
||||||
|
commit_hash="$(blitz_git_commit)"
|
||||||
|
dirty_flag="$(blitz_git_dirty_flag)"
|
||||||
|
started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
|
||||||
|
temp_context="${context_file}.tmp.$$"
|
||||||
|
temp_info="${run_dir}/run-info.json.tmp.$$"
|
||||||
|
|
||||||
|
mkdir -p "${run_dir}"
|
||||||
|
printf '%s\n' "${run_id}" > "${id_file}"
|
||||||
|
|
||||||
|
cat > "${temp_context}" <<EOF
|
||||||
|
BLITZ_RUN_ID=${run_id}
|
||||||
|
BLITZ_RUN_DIR=${run_dir}
|
||||||
|
BLITZ_BOOT_ID=${boot_id}
|
||||||
|
BLITZ_RUN_ROOT=${BLITZ_RUN_ROOT}
|
||||||
|
EOF
|
||||||
|
mv -f "${temp_context}" "${context_file}"
|
||||||
|
|
||||||
|
python3 - "${temp_info}" "${run_id}" "${run_dir}" "${boot_id}" "${started_at}" "${commit_hash}" "${dirty_flag}" "${HOSTNAME:-$(hostname)}" <<'PY'
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
path, run_id, run_dir, boot_id, started_at, commit_hash, dirty_flag, hostname = sys.argv[1:9]
|
||||||
|
payload = {
|
||||||
|
"run_id": run_id,
|
||||||
|
"run_dir": run_dir,
|
||||||
|
"boot_id": boot_id,
|
||||||
|
"started_at": started_at,
|
||||||
|
"hostname": hostname,
|
||||||
|
"git_commit": commit_hash,
|
||||||
|
"git_dirty": dirty_flag == "1",
|
||||||
|
"env": {
|
||||||
|
key: os.environ.get(key, "")
|
||||||
|
for key in sorted(os.environ)
|
||||||
|
if key.startswith(("BLITZ_", "OMNI_", "ROBOT_RECEIVER_"))
|
||||||
|
},
|
||||||
|
}
|
||||||
|
with open(path, "w", encoding="utf-8") as handle:
|
||||||
|
json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
|
||||||
|
PY
|
||||||
|
mv -f "${temp_info}" "${run_dir}/run-info.json"
|
||||||
|
ln -sfn "${run_dir}" "${BLITZ_CURRENT_RUN_LINK}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_init_run_context() {
|
||||||
|
local run_id
|
||||||
|
local boot_id
|
||||||
|
local run_dir
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_prepare_runtime_dir
|
||||||
|
blitz_prepare_run_root
|
||||||
|
|
||||||
|
run_id="$(blitz_new_run_id)"
|
||||||
|
boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || blitz_new_run_id)"
|
||||||
|
run_dir="${BLITZ_RUN_ROOT}/runs/${run_id}"
|
||||||
|
|
||||||
|
export BLITZ_RUN_ID="${run_id}"
|
||||||
|
export BLITZ_RUN_DIR="${run_dir}"
|
||||||
|
export BLITZ_BOOT_ID="${boot_id}"
|
||||||
|
blitz_write_run_context "${run_id}" "${run_dir}" "${boot_id}"
|
||||||
|
blitz_log "run-context" "init" "success" "run_id=${run_id} run_dir=${run_dir}" 0
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_require_run_context() {
|
||||||
|
blitz_load_boot_env
|
||||||
|
if blitz_load_run_context_env; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
blitz_log "run-context" "load" "failure" "missing ${BLITZ_RUN_CONTEXT_FILE}" 1
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_ensure_instance_id() {
|
||||||
|
if [[ -n "${BLITZ_INSTANCE_ID:-}" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
export BLITZ_INSTANCE_ID="$(blitz_new_instance_id)"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_jsonl_rotate_if_needed() {
|
||||||
|
local path="$1"
|
||||||
|
local max_bytes="${2:-${BLITZ_JSONL_ROTATE_BYTES:-0}}"
|
||||||
|
local max_files="${3:-${BLITZ_JSONL_ROTATE_FILES:-0}}"
|
||||||
|
local size=0
|
||||||
|
local index
|
||||||
|
|
||||||
|
if [[ -z "${path}" || ! -f "${path}" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
if (( max_bytes <= 0 || max_files <= 0 )); then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
size="$(stat -c %s "${path}" 2>/dev/null || echo 0)"
|
||||||
|
if (( size < max_bytes )); then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
for (( index=max_files; index>=1; index-- )); do
|
||||||
|
if [[ "${index}" -eq "${max_files}" ]]; then
|
||||||
|
rm -f "${path}.${index}"
|
||||||
|
fi
|
||||||
|
if [[ -f "${path}.${index}" ]]; then
|
||||||
|
mv -f "${path}.${index}" "${path}.$(( index + 1 ))"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
mv -f "${path}" "${path}.1"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_jsonl_append_line() {
|
||||||
|
local path="$1"
|
||||||
|
local line="$2"
|
||||||
|
|
||||||
|
mkdir -p "$(dirname "${path}")"
|
||||||
|
blitz_jsonl_rotate_if_needed "${path}"
|
||||||
|
printf '%s\n' "${line}" >> "${path}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_launch_incident_capture() {
|
||||||
|
local launch_script="${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh"
|
||||||
|
|
||||||
|
if [[ ! -f "${launch_script}" ]]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
"${launch_script}" "$@" >/dev/null 2>&1 || return 1
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,9 +9,11 @@ STEP="disable"
|
|||||||
SYSTEMD_DEST_DIR="/etc/systemd/system"
|
SYSTEMD_DEST_DIR="/etc/systemd/system"
|
||||||
UNITS=(
|
UNITS=(
|
||||||
"blitz-watchdog.service"
|
"blitz-watchdog.service"
|
||||||
|
"blitz-5g-link-logger.service"
|
||||||
"blitz-b-side-omnid.service"
|
"blitz-b-side-omnid.service"
|
||||||
"blitz-ros-receiver.service"
|
"blitz-ros-receiver.service"
|
||||||
"blitz-5g-dial.service"
|
"blitz-5g-dial.service"
|
||||||
|
"blitz-run-context.service"
|
||||||
"blitz-boot-gate.service"
|
"blitz-boot-gate.service"
|
||||||
"blitz-robot.target"
|
"blitz-robot.target"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -54,9 +54,12 @@ touch "${BLITZ_LOG_FILE}"
|
|||||||
chmod 0644 "${BLITZ_LOG_FILE}"
|
chmod 0644 "${BLITZ_LOG_FILE}"
|
||||||
blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0
|
blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0
|
||||||
blitz_prepare_runtime_dir
|
blitz_prepare_runtime_dir
|
||||||
|
blitz_prepare_run_root
|
||||||
|
|
||||||
install_unit "blitz-boot-gate.service.in"
|
install_unit "blitz-boot-gate.service.in"
|
||||||
|
install_unit "blitz-run-context.service.in"
|
||||||
install_unit "blitz-5g-dial.service.in"
|
install_unit "blitz-5g-dial.service.in"
|
||||||
|
install_unit "blitz-5g-link-logger.service.in"
|
||||||
install_unit "blitz-ros-receiver.service.in"
|
install_unit "blitz-ros-receiver.service.in"
|
||||||
install_unit "blitz-b-side-omnid.service.in"
|
install_unit "blitz-b-side-omnid.service.in"
|
||||||
install_unit "blitz-watchdog.service.in"
|
install_unit "blitz-watchdog.service.in"
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
# Override machine-specific values in robot-boot.env.local.
|
# Override machine-specific values in robot-boot.env.local.
|
||||||
|
|
||||||
BLITZ_BOOT_DELAY_SEC="30"
|
BLITZ_BOOT_DELAY_SEC="30"
|
||||||
|
BLITZ_RUN_ROOT="/var/log/blitz-robot"
|
||||||
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
|
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
|
||||||
BLITZ_RUNTIME_DIR="/run/blitz-robot"
|
BLITZ_RUNTIME_DIR="/run/blitz-robot"
|
||||||
|
|
||||||
@@ -25,6 +26,16 @@ BLITZ_ROS_SOCKET_WAIT_SEC="20"
|
|||||||
BLITZ_WATCHDOG_INTERVAL_SEC="5"
|
BLITZ_WATCHDOG_INTERVAL_SEC="5"
|
||||||
BLITZ_HEALTH_STALE_SEC="15"
|
BLITZ_HEALTH_STALE_SEC="15"
|
||||||
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
|
||||||
|
BLITZ_KCP_STATS_INTERVAL_MS="1000"
|
||||||
|
BLITZ_CONTROL_LATENCY_LOG_ENABLED="1"
|
||||||
|
BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="100"
|
||||||
|
BLITZ_5G_LINK_LOG_INTERVAL_SEC="5"
|
||||||
|
BLITZ_JSONL_FLUSH_INTERVAL_MS="1000"
|
||||||
|
BLITZ_JSONL_FLUSH_BYTES="262144"
|
||||||
|
BLITZ_JSONL_ROTATE_BYTES="134217728"
|
||||||
|
BLITZ_JSONL_ROTATE_FILES="8"
|
||||||
|
BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="5"
|
||||||
|
BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="30"
|
||||||
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
BLITZ_NETWORK_FAIL_THRESHOLD="3"
|
||||||
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
|
||||||
BLITZ_GPS_MONITOR_ENABLED="1"
|
BLITZ_GPS_MONITOR_ENABLED="1"
|
||||||
|
|||||||
18
scripts/boot/start-5g-link-logger-service.sh
Normal file
18
scripts/boot/start-5g-link-logger-service.sh
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/common.sh"
|
||||||
|
|
||||||
|
STEP="5g-link-logger-service"
|
||||||
|
|
||||||
|
blitz_load_boot_env
|
||||||
|
blitz_require_run_context
|
||||||
|
|
||||||
|
export OMNI_BOOT_MODE="1"
|
||||||
|
export BLITZ_INSTANCE_ID="${BLITZ_INSTANCE_ID:-$(blitz_new_instance_id)}"
|
||||||
|
export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}"
|
||||||
|
|
||||||
|
blitz_log "${STEP}" "start" "start" "exec bash ${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh" 0
|
||||||
|
exec bash "${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh"
|
||||||
@@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh"
|
|||||||
STEP="b-side-omnid"
|
STEP="b-side-omnid"
|
||||||
|
|
||||||
blitz_load_boot_env
|
blitz_load_boot_env
|
||||||
|
blitz_require_run_context
|
||||||
|
|
||||||
blitz_require_executable "${OMNISOCKETGO_ROOT}/bin/b_side_omnid" "${STEP}"
|
blitz_require_executable "${OMNISOCKETGO_ROOT}/bin/b_side_omnid" "${STEP}"
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh"
|
|||||||
STEP="ros-receiver"
|
STEP="ros-receiver"
|
||||||
|
|
||||||
blitz_load_boot_env
|
blitz_load_boot_env
|
||||||
|
blitz_require_run_context
|
||||||
|
|
||||||
blitz_require_file "/opt/ros/${ROS_DISTRO}/setup.bash" "${STEP}"
|
blitz_require_file "/opt/ros/${ROS_DISTRO}/setup.bash" "${STEP}"
|
||||||
blitz_require_file "${ROS_CONTROL_PY_DIR}/install/setup.bash" "${STEP}"
|
blitz_require_file "${ROS_CONTROL_PY_DIR}/install/setup.bash" "${STEP}"
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot 5G dial
|
Description=Blitz robot 5G dial
|
||||||
After=blitz-boot-gate.service
|
PartOf=blitz-robot.target
|
||||||
Requires=blitz-boot-gate.service
|
After=blitz-run-context.service
|
||||||
|
Requires=blitz-run-context.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
18
scripts/boot/systemd/blitz-5g-link-logger.service.in
Normal file
18
scripts/boot/systemd/blitz-5g-link-logger.service.in
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Blitz robot 5G link logger
|
||||||
|
PartOf=blitz-robot.target
|
||||||
|
After=blitz-run-context.service blitz-5g-dial.service
|
||||||
|
Wants=blitz-run-context.service blitz-5g-dial.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
EnvironmentFile=-/run/blitz-robot/run-context.env
|
||||||
|
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
||||||
|
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-5g-link-logger-service.sh
|
||||||
|
Restart=always
|
||||||
|
RestartSec=5
|
||||||
|
StandardOutput=append:@BLITZ_LOG_FILE@
|
||||||
|
StandardError=append:@BLITZ_LOG_FILE@
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=blitz-robot.target
|
||||||
@@ -1,12 +1,15 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot b-side omnid
|
Description=Blitz robot b-side omnid
|
||||||
After=blitz-5g-dial.service blitz-ros-receiver.service
|
PartOf=blitz-robot.target
|
||||||
Wants=blitz-5g-dial.service blitz-ros-receiver.service
|
After=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service
|
||||||
|
Wants=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
EnvironmentFile=-/run/blitz-robot/run-context.env
|
||||||
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
||||||
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh
|
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh
|
||||||
|
ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason b-side-service-exit; fi'
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=2
|
RestartSec=2
|
||||||
StandardOutput=append:@BLITZ_LOG_FILE@
|
StandardOutput=append:@BLITZ_LOG_FILE@
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot boot gate
|
Description=Blitz robot boot gate
|
||||||
|
PartOf=blitz-robot.target
|
||||||
After=multi-user.target network-online.target
|
After=multi-user.target network-online.target
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot boot chain
|
Description=Blitz robot boot chain
|
||||||
Wants=blitz-boot-gate.service
|
Wants=blitz-boot-gate.service
|
||||||
|
Wants=blitz-run-context.service
|
||||||
Wants=blitz-5g-dial.service
|
Wants=blitz-5g-dial.service
|
||||||
|
Wants=blitz-5g-link-logger.service
|
||||||
Wants=blitz-ros-receiver.service
|
Wants=blitz-ros-receiver.service
|
||||||
Wants=blitz-b-side-omnid.service
|
Wants=blitz-b-side-omnid.service
|
||||||
Wants=blitz-watchdog.service
|
Wants=blitz-watchdog.service
|
||||||
|
|||||||
@@ -1,15 +1,18 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot ROS receiver
|
Description=Blitz robot ROS receiver
|
||||||
After=blitz-5g-dial.service
|
PartOf=blitz-robot.target
|
||||||
Wants=blitz-5g-dial.service
|
After=blitz-run-context.service blitz-5g-dial.service
|
||||||
|
Wants=blitz-run-context.service blitz-5g-dial.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
User=@BLITZ_ROS_USER@
|
User=@BLITZ_ROS_USER@
|
||||||
PermissionsStartOnly=true
|
PermissionsStartOnly=true
|
||||||
|
EnvironmentFile=-/run/blitz-robot/run-context.env
|
||||||
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
||||||
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh
|
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh
|
||||||
ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver
|
ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver
|
||||||
|
ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason ros-service-exit; fi'
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=2
|
RestartSec=2
|
||||||
StandardOutput=append:@BLITZ_LOG_FILE@
|
StandardOutput=append:@BLITZ_LOG_FILE@
|
||||||
|
|||||||
15
scripts/boot/systemd/blitz-run-context.service.in
Normal file
15
scripts/boot/systemd/blitz-run-context.service.in
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Blitz robot run context
|
||||||
|
PartOf=blitz-robot.target
|
||||||
|
After=blitz-boot-gate.service
|
||||||
|
Requires=blitz-boot-gate.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=yes
|
||||||
|
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-run-context.sh
|
||||||
|
StandardOutput=append:@BLITZ_LOG_FILE@
|
||||||
|
StandardError=append:@BLITZ_LOG_FILE@
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=blitz-robot.target
|
||||||
@@ -1,10 +1,12 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Blitz robot health watchdog
|
Description=Blitz robot health watchdog
|
||||||
After=blitz-b-side-omnid.service blitz-ros-receiver.service
|
PartOf=blitz-robot.target
|
||||||
Wants=blitz-b-side-omnid.service blitz-ros-receiver.service
|
After=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service
|
||||||
|
Wants=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
EnvironmentFile=-/run/blitz-robot/run-context.env
|
||||||
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
|
||||||
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh
|
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh
|
||||||
Restart=always
|
Restart=always
|
||||||
|
|||||||
@@ -134,3 +134,162 @@ export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR:-${ROBOT_SIDE_OMNISO
|
|||||||
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}"
|
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}"
|
||||||
export OMNI_CONTROL_UNIX_SOCKET_PATH="${OMNI_CONTROL_UNIX_SOCKET_PATH:-${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}}"
|
export OMNI_CONTROL_UNIX_SOCKET_PATH="${OMNI_CONTROL_UNIX_SOCKET_PATH:-${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}}"
|
||||||
export B_SIDE_OMNID_USE_SUDO="${B_SIDE_OMNID_USE_SUDO:-1}"
|
export B_SIDE_OMNID_USE_SUDO="${B_SIDE_OMNID_USE_SUDO:-1}"
|
||||||
|
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-${OMNISOCKETGO_ROOT}/logs/runtime}"
|
||||||
|
export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-${OMNISOCKETGO_ROOT}/logs}"
|
||||||
|
export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}"
|
||||||
|
export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}"
|
||||||
|
export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}"
|
||||||
|
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
|
||||||
|
export BLITZ_5G_MODEM_SUBNET="${BLITZ_5G_MODEM_SUBNET:-192.168.224.0/22}"
|
||||||
|
export BLITZ_5G_GATEWAY="${BLITZ_5G_GATEWAY:-192.168.225.1}"
|
||||||
|
export BLITZ_5G_ROUTE_TARGETS="${BLITZ_5G_ROUTE_TARGETS:-106.55.173.235}"
|
||||||
|
export BLITZ_5G_INFO_JSON="${BLITZ_5G_INFO_JSON:-${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json}"
|
||||||
|
export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-}"
|
||||||
|
export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}"
|
||||||
|
export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}"
|
||||||
|
export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}"
|
||||||
|
export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}"
|
||||||
|
export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}"
|
||||||
|
export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}"
|
||||||
|
export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}"
|
||||||
|
export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}"
|
||||||
|
|
||||||
|
blitz_dev_utc_compact_timestamp() {
|
||||||
|
date -u '+%Y%m%dT%H%M%SZ'
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_git_commit() {
|
||||||
|
git -C "${OMNISOCKETGO_ROOT}" rev-parse HEAD 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_git_dirty_flag() {
|
||||||
|
if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty >/dev/null 2>&1; then
|
||||||
|
printf '0\n'
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
printf '1\n'
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_prepare_dirs() {
|
||||||
|
mkdir -p "${BLITZ_RUNTIME_DIR}" "${BLITZ_RUN_ROOT}/runs" "${BLITZ_RUN_ROOT}/incidents"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_write_run_info() {
|
||||||
|
local run_dir="$1"
|
||||||
|
local run_id="$2"
|
||||||
|
local boot_id="$3"
|
||||||
|
local tmp_info="${run_dir}/run-info.json.tmp.$$"
|
||||||
|
local started_at
|
||||||
|
local commit_hash
|
||||||
|
local dirty_flag
|
||||||
|
|
||||||
|
started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
|
||||||
|
commit_hash="$(blitz_dev_git_commit)"
|
||||||
|
dirty_flag="$(blitz_dev_git_dirty_flag)"
|
||||||
|
|
||||||
|
python3 - "${tmp_info}" "${run_id}" "${run_dir}" "${boot_id}" "${started_at}" "${commit_hash}" "${dirty_flag}" "${HOSTNAME:-$(hostname)}" <<'PY'
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
path, run_id, run_dir, boot_id, started_at, commit_hash, dirty_flag, hostname = sys.argv[1:9]
|
||||||
|
payload = {
|
||||||
|
"run_id": run_id,
|
||||||
|
"run_dir": run_dir,
|
||||||
|
"boot_id": boot_id,
|
||||||
|
"started_at": started_at,
|
||||||
|
"hostname": hostname,
|
||||||
|
"git_commit": commit_hash,
|
||||||
|
"git_dirty": dirty_flag == "1",
|
||||||
|
"env": {
|
||||||
|
key: os.environ.get(key, "")
|
||||||
|
for key in sorted(os.environ)
|
||||||
|
if key.startswith(("BLITZ_", "OMNI_", "ROBOT_RECEIVER_"))
|
||||||
|
},
|
||||||
|
}
|
||||||
|
with open(path, "w", encoding="utf-8") as handle:
|
||||||
|
json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
|
||||||
|
PY
|
||||||
|
mv -f "${tmp_info}" "${run_dir}/run-info.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_init_run_context() {
|
||||||
|
local run_id="${1:-$(blitz_dev_utc_compact_timestamp)}"
|
||||||
|
local boot_id="dev-$(blitz_dev_utc_compact_timestamp)"
|
||||||
|
local run_dir="${BLITZ_RUN_ROOT}/runs/${run_id}"
|
||||||
|
local tmp_context="${BLITZ_RUN_CONTEXT_FILE}.tmp.$$"
|
||||||
|
|
||||||
|
blitz_dev_prepare_dirs
|
||||||
|
mkdir -p "${run_dir}"
|
||||||
|
export BLITZ_RUN_ID="${run_id}"
|
||||||
|
export BLITZ_RUN_DIR="${run_dir}"
|
||||||
|
export BLITZ_BOOT_ID="${boot_id}"
|
||||||
|
printf '%s\n' "${run_id}" > "${BLITZ_RUN_ID_FILE}"
|
||||||
|
cat > "${tmp_context}" <<EOF
|
||||||
|
BLITZ_RUN_ID=${run_id}
|
||||||
|
BLITZ_RUN_DIR=${run_dir}
|
||||||
|
BLITZ_BOOT_ID=${boot_id}
|
||||||
|
BLITZ_RUN_ROOT=${BLITZ_RUN_ROOT}
|
||||||
|
EOF
|
||||||
|
mv -f "${tmp_context}" "${BLITZ_RUN_CONTEXT_FILE}"
|
||||||
|
ln -sfn "${run_dir}" "${BLITZ_CURRENT_RUN_LINK}"
|
||||||
|
blitz_dev_write_run_info "${run_dir}" "${run_id}" "${boot_id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_load_run_context() {
|
||||||
|
if [[ ! -f "${BLITZ_RUN_CONTEXT_FILE}" ]]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
set -a
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
source "${BLITZ_RUN_CONTEXT_FILE}"
|
||||||
|
set +a
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_ensure_run_context() {
|
||||||
|
if blitz_dev_load_run_context; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
blitz_dev_init_run_context
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_reset_run_context() {
|
||||||
|
rm -f "${BLITZ_RUN_CONTEXT_FILE}" "${BLITZ_RUN_ID_FILE}"
|
||||||
|
blitz_dev_init_run_context
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_init_instance_context() {
|
||||||
|
if [[ -z "${BLITZ_INSTANCE_ID:-}" ]]; then
|
||||||
|
export BLITZ_INSTANCE_ID="$(blitz_dev_utc_compact_timestamp)-$$"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_component_log_path() {
|
||||||
|
local stem="$1"
|
||||||
|
printf '%s/%s.%s.jsonl\n' "${BLITZ_RUN_DIR}" "${stem}" "${BLITZ_INSTANCE_ID}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_prepare_backend_logging_env() {
|
||||||
|
blitz_dev_init_instance_context
|
||||||
|
if [[ "${OMNI_NETWORK_SUMMARY_LOG_PATH}" == "${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl" ]]; then
|
||||||
|
export OMNI_NETWORK_SUMMARY_LOG_PATH
|
||||||
|
OMNI_NETWORK_SUMMARY_LOG_PATH="$(blitz_dev_component_log_path "a-network-summary")"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_prepare_bside_logging_env() {
|
||||||
|
blitz_dev_init_instance_context
|
||||||
|
export BLITZ_KCP_STATS_LOG_PATH="${BLITZ_KCP_STATS_LOG_PATH:-$(blitz_dev_component_log_path "b-kcp-session-stats")}"
|
||||||
|
export BLITZ_CONTROL_LATENCY_LOG_PATH="${BLITZ_CONTROL_LATENCY_LOG_PATH:-$(blitz_dev_component_log_path "b-control-latency")}"
|
||||||
|
}
|
||||||
|
|
||||||
|
blitz_dev_prepare_5g_logging_env() {
|
||||||
|
blitz_dev_init_instance_context
|
||||||
|
export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-$(blitz_dev_component_log_path "b-5g-link-quality")}"
|
||||||
|
}
|
||||||
|
|
||||||
|
if [[ "${BLITZ_SKIP_DEV_RUN_CONTEXT_INIT:-0}" != "1" && "${BLITZ_BOOT_LOADING_ENV:-0}" != "1" && "${OMNI_BOOT_MODE:-0}" != "1" ]]; then
|
||||||
|
blitz_dev_ensure_run_context
|
||||||
|
elif [[ -f "${BLITZ_RUN_CONTEXT_FILE}" ]]; then
|
||||||
|
blitz_dev_load_run_context || true
|
||||||
|
fi
|
||||||
|
|||||||
10
scripts/dev/reset-run-context.sh
Normal file
10
scripts/dev/reset-run-context.sh
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
export BLITZ_SKIP_DEV_RUN_CONTEXT_INIT="1"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/load-env.sh"
|
||||||
|
|
||||||
|
blitz_dev_reset_run_context
|
||||||
|
printf 'run_id=%s\nrun_dir=%s\n' "${BLITZ_RUN_ID}" "${BLITZ_RUN_DIR}"
|
||||||
9
scripts/dev/start-5g-link-logger.sh
Normal file
9
scripts/dev/start-5g-link-logger.sh
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${SCRIPT_DIR}/load-env.sh"
|
||||||
|
|
||||||
|
blitz_dev_prepare_5g_logging_env
|
||||||
|
exec bash "${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh"
|
||||||
@@ -4,6 +4,7 @@ set -euo pipefail
|
|||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source "${SCRIPT_DIR}/load-env.sh"
|
source "${SCRIPT_DIR}/load-env.sh"
|
||||||
|
blitz_dev_prepare_bside_logging_env
|
||||||
|
|
||||||
cd "${OMNISOCKETGO_ROOT}"
|
cd "${OMNISOCKETGO_ROOT}"
|
||||||
|
|
||||||
@@ -14,6 +15,24 @@ export OMNI_VIDEO_RELAY_VIA="${OMNI_VIDEO_RELAY_VIA}"
|
|||||||
export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR}"
|
export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR}"
|
||||||
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA}"
|
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA}"
|
||||||
|
|
||||||
|
logger_pid=""
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
if [[ -n "${logger_pid}" ]]; then
|
||||||
|
kill "${logger_pid}" 2>/dev/null || true
|
||||||
|
wait "${logger_pid}" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
start_5g_link_logger_if_needed() {
|
||||||
|
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
bash "${SCRIPT_DIR}/start-5g-link-logger.sh" &
|
||||||
|
logger_pid=$!
|
||||||
|
echo "[start-b-side-omnid] 5G link logger -> ${BLITZ_5G_LINK_LOG_PATH:-unset}" >&2
|
||||||
|
}
|
||||||
|
|
||||||
if [[ ! -x "./bin/b_side_omnid" ]]; then
|
if [[ ! -x "./bin/b_side_omnid" ]]; then
|
||||||
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
|
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
|
||||||
echo "Missing ./bin/b_side_omnid in boot mode; build it before enabling the autostart service." >&2
|
echo "Missing ./bin/b_side_omnid in boot mode; build it before enabling the autostart service." >&2
|
||||||
@@ -23,12 +42,14 @@ if [[ ! -x "./bin/b_side_omnid" ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
launch_b_side_omnid() {
|
launch_b_side_omnid() {
|
||||||
|
trap cleanup EXIT INT TERM
|
||||||
|
start_5g_link_logger_if_needed
|
||||||
bash "${SCRIPT_DIR}/apply-camera-controls.sh"
|
bash "${SCRIPT_DIR}/apply-camera-controls.sh"
|
||||||
exec ./bin/b_side_omnid
|
./bin/b_side_omnid
|
||||||
}
|
}
|
||||||
|
|
||||||
if [[ "${B_SIDE_OMNID_USE_SUDO}" == "1" && "${EUID}" -ne 0 ]]; then
|
if [[ "${B_SIDE_OMNID_USE_SUDO}" == "1" && "${EUID}" -ne 0 ]]; then
|
||||||
exec sudo -E bash -lc 'cd "$1" && bash "$2" && exec "$3"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/apply-camera-controls.sh" "./bin/b_side_omnid"
|
exec sudo -E bash -lc 'cd "$1" && export B_SIDE_OMNID_USE_SUDO=0 && exec bash "$2"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/start-b-side-omnid.sh"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
launch_b_side_omnid
|
launch_b_side_omnid
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source "${SCRIPT_DIR}/load-env.sh"
|
source "${SCRIPT_DIR}/load-env.sh"
|
||||||
require_robot_command_center_root
|
require_robot_command_center_root
|
||||||
|
blitz_dev_prepare_backend_logging_env
|
||||||
|
|
||||||
if [[ ! -d "${PYTHON_VENV_PATH}" ]]; then
|
if [[ ! -d "${PYTHON_VENV_PATH}" ]]; then
|
||||||
"${PYTHON3_BIN}" -m venv "${PYTHON_VENV_PATH}"
|
"${PYTHON3_BIN}" -m venv "${PYTHON_VENV_PATH}"
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ kcp_packet_debug_logger_t *kcp_packet_debug_open_jsonl(const char *path) {
|
|||||||
fclose(file);
|
fclose(file);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
omni_file_logger_init(&logger->file_logger, file);
|
omni_file_logger_init_path(&logger->file_logger, file, path, 0);
|
||||||
logger->enabled = 1;
|
logger->enabled = 1;
|
||||||
return logger;
|
return logger;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ kcp_session_stats_logger_t *kcp_session_stats_open_jsonl(const char *path) {
|
|||||||
fclose(file);
|
fclose(file);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
omni_file_logger_init(&logger->file_logger, file);
|
omni_file_logger_init_path(&logger->file_logger, file, path, 0);
|
||||||
logger->enabled = 1;
|
logger->enabled = 1;
|
||||||
return logger;
|
return logger;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ latency_logger_t *latencylog_open_jsonl(const char *path) {
|
|||||||
fclose(file);
|
fclose(file);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
omni_file_logger_init(&logger->file_logger, file);
|
omni_file_logger_init_path(&logger->file_logger, file, path, 0);
|
||||||
logger->enabled = 1;
|
logger->enabled = 1;
|
||||||
return logger;
|
return logger;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -544,9 +544,217 @@ const char *omni_path_base_name(const char *path) {
|
|||||||
return slash == NULL ? path : slash + 1;
|
return slash == NULL ? path : slash + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint64_t omni_now_monotonic_ms64(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return (uint64_t) ts.tv_sec * 1000ULL + (uint64_t) (ts.tv_nsec / 1000000L);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_positive_int_env(const char *name, int default_value) {
|
||||||
|
const char *raw = getenv(name);
|
||||||
|
long parsed;
|
||||||
|
char *endptr = NULL;
|
||||||
|
|
||||||
|
if (raw == NULL || raw[0] == '\0') {
|
||||||
|
return default_value;
|
||||||
|
}
|
||||||
|
parsed = strtol(raw, &endptr, 10);
|
||||||
|
if (endptr == raw || *endptr != '\0' || parsed <= 0) {
|
||||||
|
return default_value;
|
||||||
|
}
|
||||||
|
return (int) parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t omni_positive_size_env(const char *name, size_t default_value) {
|
||||||
|
const char *raw = getenv(name);
|
||||||
|
unsigned long long parsed;
|
||||||
|
char *endptr = NULL;
|
||||||
|
|
||||||
|
if (raw == NULL || raw[0] == '\0') {
|
||||||
|
return default_value;
|
||||||
|
}
|
||||||
|
parsed = strtoull(raw, &endptr, 10);
|
||||||
|
if (endptr == raw || *endptr != '\0' || parsed == 0ULL) {
|
||||||
|
return default_value;
|
||||||
|
}
|
||||||
|
return (size_t) parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_file_logger_flush_locked(omni_file_logger_t *logger, uint64_t now_ms) {
|
||||||
|
if (logger == NULL || logger->file == NULL) {
|
||||||
|
errno = EINVAL;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (fflush(logger->file) != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
logger->buffered_bytes = 0U;
|
||||||
|
logger->last_flush_monotonic_ms = now_ms;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_build_rotated_path(char *buffer, size_t buffer_len, const char *path, int suffix) {
|
||||||
|
size_t path_len;
|
||||||
|
int written;
|
||||||
|
|
||||||
|
if (buffer == NULL || buffer_len == 0U || path == NULL || path[0] == '\0') {
|
||||||
|
errno = EINVAL;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
path_len = strlen(path);
|
||||||
|
if (path_len + 16U >= buffer_len) {
|
||||||
|
errno = ENAMETOOLONG;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
memcpy(buffer, path, path_len);
|
||||||
|
written = snprintf(buffer + path_len, buffer_len - path_len, ".%d", suffix);
|
||||||
|
if (written < 0 || (size_t) written >= buffer_len - path_len) {
|
||||||
|
errno = ENAMETOOLONG;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_file_logger_reopen_append_locked(omni_file_logger_t *logger) {
|
||||||
|
struct stat st;
|
||||||
|
FILE *file;
|
||||||
|
|
||||||
|
if (logger == NULL || logger->path[0] == '\0') {
|
||||||
|
errno = EINVAL;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
file = fopen(logger->path, "ab");
|
||||||
|
if (file == NULL) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger->file = file;
|
||||||
|
logger->current_bytes = 0U;
|
||||||
|
if (stat(logger->path, &st) == 0) {
|
||||||
|
logger->current_bytes = (size_t) st.st_size;
|
||||||
|
}
|
||||||
|
logger->buffered_bytes = 0U;
|
||||||
|
logger->last_flush_monotonic_ms = omni_now_monotonic_ms64();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_file_logger_recover_after_rotate_locked(omni_file_logger_t *logger, const char *rotated_current_path) {
|
||||||
|
int reopen_errno;
|
||||||
|
|
||||||
|
if (omni_file_logger_reopen_append_locked(logger) == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
reopen_errno = errno;
|
||||||
|
if (rotated_current_path != NULL && rotated_current_path[0] != '\0') {
|
||||||
|
if (rename(rotated_current_path, logger->path) == 0) {
|
||||||
|
if (omni_file_logger_reopen_append_locked(logger) == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
errno = reopen_errno;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int omni_file_logger_rotate_locked(omni_file_logger_t *logger) {
|
||||||
|
int index;
|
||||||
|
int saved_errno = 0;
|
||||||
|
int should_recover = 0;
|
||||||
|
char rotated_current_path[PATH_MAX];
|
||||||
|
char from_path[PATH_MAX];
|
||||||
|
char to_path[PATH_MAX];
|
||||||
|
|
||||||
|
if (logger == NULL || logger->path[0] == '\0' || logger->max_bytes == 0U || logger->max_files <= 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
rotated_current_path[0] = '\0';
|
||||||
|
if (logger->file != NULL) {
|
||||||
|
if (omni_file_logger_flush_locked(logger, omni_now_monotonic_ms64()) != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
should_recover = 1;
|
||||||
|
if (fclose(logger->file) != 0) {
|
||||||
|
logger->file = NULL;
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
logger->file = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, logger->max_files) != 0) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
unlink(from_path);
|
||||||
|
for (index = logger->max_files - 1; index >= 1; --index) {
|
||||||
|
if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, index) != 0 ||
|
||||||
|
omni_build_rotated_path(to_path, sizeof(to_path), logger->path, index + 1) != 0) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
if (rename(from_path, to_path) != 0 && errno != ENOENT) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (omni_build_rotated_path(to_path, sizeof(to_path), logger->path, 1) != 0) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
if (rename(logger->path, to_path) != 0 && errno != ENOENT) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
snprintf(rotated_current_path, sizeof(rotated_current_path), "%s", to_path);
|
||||||
|
|
||||||
|
if (omni_file_logger_reopen_append_locked(logger) != 0) {
|
||||||
|
saved_errno = errno;
|
||||||
|
goto recover;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
recover:
|
||||||
|
if (should_recover) {
|
||||||
|
int recover_errno = saved_errno != 0 ? saved_errno : errno;
|
||||||
|
if (omni_file_logger_recover_after_rotate_locked(logger, rotated_current_path) == 0) {
|
||||||
|
errno = recover_errno;
|
||||||
|
} else if (saved_errno != 0) {
|
||||||
|
errno = saved_errno;
|
||||||
|
}
|
||||||
|
} else if (saved_errno != 0) {
|
||||||
|
errno = saved_errno;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file) {
|
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file) {
|
||||||
|
memset(logger, 0, sizeof(*logger));
|
||||||
logger->file = file;
|
logger->file = file;
|
||||||
pthread_mutex_init(&logger->mutex, NULL);
|
pthread_mutex_init(&logger->mutex, NULL);
|
||||||
|
logger->flush_bytes = 1U;
|
||||||
|
logger->flush_interval_ms = 0;
|
||||||
|
logger->immediate_flush = 1;
|
||||||
|
logger->last_flush_monotonic_ms = omni_now_monotonic_ms64();
|
||||||
|
}
|
||||||
|
|
||||||
|
void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush) {
|
||||||
|
struct stat st;
|
||||||
|
|
||||||
|
omni_file_logger_init(logger, file);
|
||||||
|
if (path != NULL && path[0] != '\0') {
|
||||||
|
snprintf(logger->path, sizeof(logger->path), "%s", path);
|
||||||
|
if (stat(path, &st) == 0) {
|
||||||
|
logger->current_bytes = (size_t) st.st_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger->flush_bytes = omni_positive_size_env("BLITZ_JSONL_FLUSH_BYTES", 262144U);
|
||||||
|
logger->flush_interval_ms = omni_positive_int_env("BLITZ_JSONL_FLUSH_INTERVAL_MS", 1000);
|
||||||
|
logger->max_bytes = omni_positive_size_env("BLITZ_JSONL_ROTATE_BYTES", 134217728U);
|
||||||
|
logger->max_files = omni_positive_int_env("BLITZ_JSONL_ROTATE_FILES", 8);
|
||||||
|
logger->immediate_flush = immediate_flush != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void omni_file_logger_destroy(omni_file_logger_t *logger) {
|
void omni_file_logger_destroy(omni_file_logger_t *logger) {
|
||||||
@@ -555,13 +763,32 @@ void omni_file_logger_destroy(omni_file_logger_t *logger) {
|
|||||||
|
|
||||||
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line) {
|
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line) {
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
|
size_t line_len;
|
||||||
|
uint64_t now_ms;
|
||||||
if (logger == NULL || logger->file == NULL || line == NULL) {
|
if (logger == NULL || logger->file == NULL || line == NULL) {
|
||||||
errno = EINVAL;
|
errno = EINVAL;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
line_len = strlen(line) + 1U;
|
||||||
|
now_ms = omni_now_monotonic_ms64();
|
||||||
pthread_mutex_lock(&logger->mutex);
|
pthread_mutex_lock(&logger->mutex);
|
||||||
if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF || fflush(logger->file) != 0) {
|
if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF) {
|
||||||
rc = -1;
|
rc = -1;
|
||||||
|
} else {
|
||||||
|
logger->current_bytes += line_len;
|
||||||
|
logger->buffered_bytes += line_len;
|
||||||
|
if (logger->immediate_flush ||
|
||||||
|
logger->buffered_bytes >= logger->flush_bytes ||
|
||||||
|
(logger->flush_interval_ms > 0 && now_ms - logger->last_flush_monotonic_ms >= (uint64_t) logger->flush_interval_ms)) {
|
||||||
|
if (omni_file_logger_flush_locked(logger, now_ms) != 0) {
|
||||||
|
rc = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (rc == 0 && logger->max_bytes > 0U && logger->current_bytes >= logger->max_bytes) {
|
||||||
|
if (omni_file_logger_rotate_locked(logger) != 0) {
|
||||||
|
rc = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&logger->mutex);
|
pthread_mutex_unlock(&logger->mutex);
|
||||||
return rc;
|
return rc;
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ tx_timestamp_debug_logger_t *tx_timestamp_debug_open_jsonl(const char *path) {
|
|||||||
fclose(file);
|
fclose(file);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
omni_file_logger_init(&logger->file_logger, file);
|
omni_file_logger_init_path(&logger->file_logger, file, path, 0);
|
||||||
logger->enabled = 1;
|
logger->enabled = 1;
|
||||||
return logger;
|
return logger;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -213,6 +213,8 @@ void video_pipeline_config_init(video_pipeline_config_t *config) {
|
|||||||
config->hard_backpressure_hold_ms = VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT;
|
config->hard_backpressure_hold_ms = VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT;
|
||||||
config->server_idle_reconnect_ms = VIDEO_DEFAULT_SERVER_IDLE_RECONNECT_MS;
|
config->server_idle_reconnect_ms = VIDEO_DEFAULT_SERVER_IDLE_RECONNECT_MS;
|
||||||
config->frame_stall_reconnect_ms = VIDEO_DEFAULT_FRAME_STALL_RECONNECT_MS;
|
config->frame_stall_reconnect_ms = VIDEO_DEFAULT_FRAME_STALL_RECONNECT_MS;
|
||||||
|
config->stats_logger = NULL;
|
||||||
|
config->stats_interval_ms = 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
||||||
@@ -235,6 +237,7 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
|||||||
config->hard_backpressure_hold_ms = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS", config->hard_backpressure_hold_ms);
|
config->hard_backpressure_hold_ms = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS", config->hard_backpressure_hold_ms);
|
||||||
config->server_idle_reconnect_ms = env_int_or_default("OMNI_VIDEO_SERVER_IDLE_RECONNECT_MS", config->server_idle_reconnect_ms);
|
config->server_idle_reconnect_ms = env_int_or_default("OMNI_VIDEO_SERVER_IDLE_RECONNECT_MS", config->server_idle_reconnect_ms);
|
||||||
config->frame_stall_reconnect_ms = env_int_or_default("OMNI_VIDEO_FRAME_STALL_RECONNECT_MS", config->frame_stall_reconnect_ms);
|
config->frame_stall_reconnect_ms = env_int_or_default("OMNI_VIDEO_FRAME_STALL_RECONNECT_MS", config->frame_stall_reconnect_ms);
|
||||||
|
config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms);
|
||||||
}
|
}
|
||||||
|
|
||||||
int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
|
int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
|
||||||
@@ -600,8 +603,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config
|
|||||||
&options,
|
&options,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
config->stats_logger,
|
||||||
KCP_DEFAULT_STATS_INTERVAL_MS
|
config->stats_interval_ms
|
||||||
);
|
);
|
||||||
if (sender->client == NULL) {
|
if (sender->client == NULL) {
|
||||||
return -1;
|
return -1;
|
||||||
|
|||||||
@@ -186,6 +186,8 @@ void video_pipeline_config_init(video_pipeline_config_t *config) {
|
|||||||
config->output_height = VIDEO_OUTPUT_HEIGHT_DEFAULT;
|
config->output_height = VIDEO_OUTPUT_HEIGHT_DEFAULT;
|
||||||
config->max_frames = 0;
|
config->max_frames = 0;
|
||||||
config->enable_timing_logs = 0;
|
config->enable_timing_logs = 0;
|
||||||
|
config->stats_logger = NULL;
|
||||||
|
config->stats_interval_ms = 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
||||||
@@ -203,6 +205,7 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) {
|
|||||||
config->max_frames = atoi(getenv("OMNI_VIDEO_MAX_FRAMES"));
|
config->max_frames = atoi(getenv("OMNI_VIDEO_MAX_FRAMES"));
|
||||||
}
|
}
|
||||||
config->enable_timing_logs = env_flag_or_default("OMNI_VIDEO_DEBUG_TIMING", config->enable_timing_logs);
|
config->enable_timing_logs = env_flag_or_default("OMNI_VIDEO_DEBUG_TIMING", config->enable_timing_logs);
|
||||||
|
config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms);
|
||||||
}
|
}
|
||||||
|
|
||||||
int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
|
int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
|
||||||
@@ -564,8 +567,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config
|
|||||||
&options,
|
&options,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
config->stats_logger,
|
||||||
KCP_DEFAULT_STATS_INTERVAL_MS
|
config->stats_interval_ms
|
||||||
);
|
);
|
||||||
if (sender->client == NULL) {
|
if (sender->client == NULL) {
|
||||||
return -1;
|
return -1;
|
||||||
|
|||||||
Reference in New Issue
Block a user