Compare commits

...

28 Commits

Author SHA1 Message Date
de3f5c9677 fix: 前端时钟校准问题 2026-04-18 17:01:13 +08:00
ae2f1c3156 fix: 前后端时钟问题 2026-04-18 16:07:20 +08:00
212459a8e4 feat: 增加日志模块 2026-04-18 12:52:39 +08:00
b700dab484 fix: 传输方向反了 2026-04-17 22:01:04 +08:00
57d79a2759 fix: 传输方向反了! 2026-04-17 17:17:59 +08:00
1c845ba51e fix: C端日志采样、每条 KCP 连接每秒 1 次 periodic snapshot 2026-04-16 11:32:48 +08:00
d64329214d fix: 机器人摄像头端口号 2026-04-15 17:33:49 +08:00
7d6b7da157 fix: 临时放宽排空速度 2026-04-15 17:02:08 +08:00
df9a56af53 fix: 临时放宽排空速度 2026-04-15 16:48:12 +08:00
fd34330081 fix: 摄像头端口号固定 2026-04-15 16:35:28 +08:00
9b705dd8f8 fix: 摄像头端口号固定 2026-04-15 16:32:03 +08:00
c6484e847e chore: jazzy 2026-04-15 13:28:43 +08:00
fab2559980 chore: video26 2026-04-15 13:27:55 +08:00
9c2df9d674 fix: 删除视频侧 server_idle_reconnect_ms 误判检查 2026-04-14 22:44:36 +08:00
bd0a282344 fix: 删除视频侧 server_idle_reconnect_ms 误判检查 2026-04-14 22:31:33 +08:00
aa6235de5a fix: blitz-run-context.service 失败 2026-04-14 21:26:56 +08:00
1c2cf157d2 fix: blitz-run-context.service 失败 2026-04-14 21:13:27 +08:00
e895cdc9de feat: 日志增强功能 2026-04-14 20:52:41 +08:00
nnbcccscdscdsc
579e67a3db fix:5G不通走其他网络,5G通了走5G 2026-04-14 17:37:44 +08:00
nnbcccscdscdsc
ebb047c7b5 feat:新增gps监控服务(断开重连) 2026-04-14 15:16:11 +08:00
bb3e7b2989 fix: 视频程序也需要有 stale-session 检测 2026-04-14 13:22:19 +08:00
6ccd9e9fa1 test: b_side进程断开现象 2026-04-14 00:34:56 +08:00
71c026ccf3 fix: ip route在接口变化后也要更新,执行route repair 2026-04-14 00:15:25 +08:00
4805cc772d Merge branch 'c' of https://106.52.207.92:9103/limingjie/OmniSocketGo into c 2026-04-13 23:46:20 +08:00
3bbeaab0c3 fix: 修复模拟接口变更名字后(可能是掉网卡也可能不是),重恢复 2026-04-13 23:46:19 +08:00
nnbcccscdscdsc
a3d8835074 Merge branch 'c' of https://106.52.207.92:9103/limingjie/OmniSocketGo into c 2026-04-13 22:33:40 +08:00
nnbcccscdscdsc
947ecb2a2b fix: GPS采集端修复 2026-04-13 22:33:20 +08:00
25c68530ba feat: 自启动与自恢复机制 2026-04-13 21:55:40 +08:00
58 changed files with 4521 additions and 673 deletions

3
.gitignore vendored
View File

@@ -25,3 +25,6 @@ c/bin
ros-control-py/install ros-control-py/install
ros-control-py/log ros-control-py/log
scripts/boot/modem_network_info.json
logs/

View File

@@ -3,20 +3,37 @@
#include <signal.h> #include <signal.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdatomic.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <sys/stat.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/un.h> #include <sys/un.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
#include "cJSON.h"
#include "control_protocol.h" #include "control_protocol.h"
#include "latencylog.h"
#include "protocol.h" #include "protocol.h"
#include "video_pipeline.h" #include "video_pipeline.h"
#define CONTROL_DEFAULT_PEER_ID "peer-b-ctrl" #define CONTROL_DEFAULT_PEER_ID "peer-b-ctrl"
#define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl" #define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl"
#define CONTROL_ACK_DEFAULT_PEER_ID "peer-b-ctrl-ack"
#define CONTROL_ACK_DEFAULT_TARGET_PEER "peer-a-ctrl-ack"
#define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock" #define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock"
#define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000 #define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000
#define DEFAULT_RUNTIME_DIR "/run/blitz-robot"
#define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json"
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
#define DEFAULT_KCP_STATS_INTERVAL_MS 1000
#define DEFAULT_CONTROL_LATENCY_SAMPLE_MOD 100
#define DEFAULT_CONTROL_ACK_SAMPLE_MOD 10
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
typedef struct unix_dgram_client { typedef struct unix_dgram_client {
int fd; int fd;
@@ -50,10 +67,32 @@ typedef struct daemon_state {
const char *control_bind_device; const char *control_bind_device;
const char *control_peer_id; const char *control_peer_id;
const char *control_expected_sender; const char *control_expected_sender;
const char *control_ack_peer_id;
const char *control_ack_target_peer;
const char *control_unix_socket; const char *control_unix_socket;
int control_server_idle_reconnect_ms; int control_server_idle_reconnect_ms;
const char *runtime_dir;
int heartbeat_timeout_sec;
int stats_interval_ms;
uint64_t control_latency_sample_mod;
uint64_t control_ack_sample_mod;
char status_file_path[512];
char video_thread_fault_file[512];
char control_thread_fault_file[512];
atomic_long video_thread_heartbeat_epoch_sec;
atomic_long control_thread_heartbeat_epoch_sec;
atomic_int control_ack_shutdown_requested;
kcp_session_stats_logger_t *stats_logger;
latency_logger_t *control_latency_logger;
video_stage_logger_t *video_stage_logger;
unix_dgram_client_t unix_client; unix_dgram_client_t unix_client;
control_bridge_stats_t control_stats; control_bridge_stats_t control_stats;
pthread_mutex_t control_ack_mutex;
pthread_t control_ack_thread;
kcp_client_t *control_ack_client;
int control_ack_thread_started;
int control_ack_connect_requested;
int control_ack_connect_inflight;
} daemon_state_t; } daemon_state_t;
static volatile sig_atomic_t g_stop_requested = 0; static volatile sig_atomic_t g_stop_requested = 0;
@@ -109,6 +148,120 @@ static int env_int_or_default(const char *name, int fallback) {
return parsed; return parsed;
} }
/*
 * Read a strictly positive decimal integer from environment variable `name`.
 *
 * Returns `fallback` when `name` is NULL, when the variable is unset or
 * empty, when the text is not a plain decimal number, when it contains a
 * minus sign, when it parses to zero, or when it does not fit in an
 * unsigned long long.
 */
static uint64_t env_u64_or_default(const char *name, uint64_t fallback) {
    const char *value;
    char *endptr = NULL;
    unsigned long long parsed;

    if (name == NULL) {
        return fallback;
    }
    value = getenv(name);
    if (value == NULL || value[0] == '\0') {
        return fallback;
    }
    /*
     * strtoull() accepts a leading '-' and silently negates the result in
     * unsigned arithmetic ("-1" becomes ULLONG_MAX). A valid decimal value
     * never contains '-', so reject it up front.
     */
    if (strchr(value, '-') != NULL) {
        return fallback;
    }
    /* errno must be cleared first: ERANGE is the only overflow indicator. */
    errno = 0;
    parsed = strtoull(value, &endptr, 10);
    if (endptr == value || *endptr != '\0' || errno == ERANGE || parsed == 0ULL) {
        return fallback;
    }
    return (uint64_t) parsed;
}
/*
 * Current CLOCK_REALTIME reading expressed as milliseconds since the epoch.
 * The nanosecond part is truncated (not rounded) to whole milliseconds.
 */
static int64_t realtime_epoch_ms(void) {
    struct timespec now;
    int64_t whole_ms;
    int64_t frac_ms;

    clock_gettime(CLOCK_REALTIME, &now);
    whole_ms = (int64_t) now.tv_sec * 1000;
    frac_ms = now.tv_nsec / 1000000;
    return whole_ms + frac_ms;
}
/* Current wall-clock time from time(), as whole seconds since the epoch. */
static long realtime_epoch_sec(void) {
    time_t now = time(NULL);

    return (long) now;
}
/*
 * Stamp `heartbeat` with the current wall-clock second.
 * A NULL pointer is ignored.
 */
static void update_thread_heartbeat(atomic_long *heartbeat) {
    if (heartbeat != NULL) {
        long stamp = realtime_epoch_sec();

        atomic_store(heartbeat, stamp);
    }
}
/*
 * Decide whether latency events should be logged for `msg`.
 *
 * Returns 0 when `state`, `msg` or the control latency logger is missing.
 * With a sample modulus of 0 or 1 every message is logged; otherwise only
 * messages whose id is a multiple of the modulus are.
 */
static int should_log_control_latency(const daemon_state_t *state, const message_t *msg) {
    if (state == NULL || msg == NULL) {
        return 0;
    }
    if (state->control_latency_logger == NULL) {
        return 0;
    }
    if (state->control_latency_sample_mod <= 1U) {
        return 1;
    }
    return (msg->id % state->control_latency_sample_mod) == 0U;
}
/*
 * Decide whether `msg` is selected for a sampled control ACK.
 *
 * Returns 0 when `state` or `msg` is NULL. With a sample modulus of 0 or 1
 * every message is selected; otherwise only messages whose id is a multiple
 * of the modulus are.
 */
static int should_send_control_ack(const daemon_state_t *state, const message_t *msg) {
    if (state == NULL || msg == NULL) {
        return 0;
    }
    if (state->control_ack_sample_mod <= 1U) {
        return 1;
    }
    return (msg->id % state->control_ack_sample_mod) == 0U;
}
/*
 * Progress callback: `context` is treated as the atomic_long heartbeat slot
 * to refresh.
 */
static void video_pipeline_heartbeat_progress(void *context) {
    atomic_long *heartbeat = (atomic_long *) context;

    update_thread_heartbeat(heartbeat);
}
/*
 * Make sure `runtime_dir` exists as a directory, creating it (mode 0775,
 * single level only) when it is absent.
 *
 * Returns 0 on success, -1 with errno set otherwise: EINVAL for a NULL or
 * empty path, ENOTDIR when the path exists but is not a directory, or the
 * errno left by stat()/mkdir().
 */
static int ensure_runtime_dir(const char *runtime_dir) {
    struct stat info;

    if (runtime_dir == NULL || runtime_dir[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    if (stat(runtime_dir, &info) != 0) {
        if (errno != ENOENT) {
            return -1;
        }
        /* EEXIST means something created the path after stat() looked. */
        if (mkdir(runtime_dir, 0775) == 0 || errno == EEXIST) {
            return 0;
        }
        return -1;
    }

    if (!S_ISDIR(info.st_mode)) {
        errno = ENOTDIR;
        return -1;
    }
    return 0;
}
/* Return 1 when `path` is a non-empty string naming an existing file, else 0. */
static int path_exists(const char *path) {
    if (path == NULL || path[0] == '\0') {
        return 0;
    }
    return access(path, F_OK) == 0;
}
/*
 * Consume a one-shot fault-injection flag file.
 *
 * Returns 1 only when `path` existed and was removed by this call, 0
 * otherwise (NULL/empty path, no such file, or removal failed).
 *
 * Removal is the test: a single unlink() leaves no window between checking
 * and deleting, so two threads cannot both consume the same flag. A flag
 * that cannot be removed is treated as not consumed, so it cannot re-trigger
 * the fault on every call.
 */
static int consume_fault_flag(const char *path) {
    if (path == NULL || path[0] == '\0') {
        return 0;
    }
    return unlink(path) == 0;
}
/*
 * Fault-injection hook: when the flag file `fault_path` is present, consume
 * it and block the calling thread for heartbeat_timeout_sec + 2 seconds so
 * that its heartbeat goes stale.
 *
 * `thread_name` is only used in the log line. Does nothing when any argument
 * is NULL or the flag is absent.
 */
static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) {
    unsigned int stall_sec;

    if (state == NULL || fault_path == NULL || thread_name == NULL) {
        return;
    }
    if (!consume_fault_flag(fault_path)) {
        return;
    }
    fprintf(
        stderr,
        "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n",
        thread_name,
        state->heartbeat_timeout_sec
    );
    /*
     * Clamp before converting: a negative timeout cast straight to unsigned
     * would turn into a sleep of roughly 2^32 seconds.
     */
    stall_sec = state->heartbeat_timeout_sec > 0 ? (unsigned int) state->heartbeat_timeout_sec : 0U;
    sleep(stall_sec + 2U);
}
static int control_bridge_stats_init(control_bridge_stats_t *stats) { static int control_bridge_stats_init(control_bridge_stats_t *stats) {
int rc; int rc;
if (stats == NULL) { if (stats == NULL) {
@@ -132,6 +285,196 @@ static void control_bridge_stats_destroy(control_bridge_stats_t *stats) {
} }
static void unix_dgram_client_close(unix_dgram_client_t *client); static void unix_dgram_client_close(unix_dgram_client_t *client);
static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats);
static void close_control_ack_client(kcp_client_t **client_ptr);
/*
 * Return 1 when both the ACK peer id and the ACK target peer are configured
 * as non-empty strings, 0 otherwise (including a NULL `state`).
 */
static int control_ack_enabled(const daemon_state_t *state) {
    if (state == NULL) {
        return 0;
    }
    if (state->control_ack_peer_id == NULL || state->control_ack_peer_id[0] == '\0') {
        return 0;
    }
    if (state->control_ack_target_peer == NULL || state->control_ack_target_peer[0] == '\0') {
        return 0;
    }
    return 1;
}
/*
 * Initialise the ACK-manager fields of `state`: the mutex, the shutdown
 * flag, and the client/connection bookkeeping (all cleared).
 *
 * Returns 0 on success; -1 with errno set to EINVAL for a NULL `state` or
 * to the pthread_mutex_init() error code.
 */
static int control_ack_manager_init(daemon_state_t *state) {
    int mutex_rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }

    mutex_rc = pthread_mutex_init(&state->control_ack_mutex, NULL);
    if (mutex_rc != 0) {
        /* pthread functions return the error code instead of setting errno. */
        errno = mutex_rc;
        return -1;
    }

    atomic_init(&state->control_ack_shutdown_requested, 0);
    state->control_ack_client = NULL;
    state->control_ack_thread_started = 0;
    state->control_ack_connect_requested = 0;
    state->control_ack_connect_inflight = 0;
    return 0;
}
/*
 * Drop the current ACK client, if any, and record whether a new connection
 * should be attempted.
 *
 * A reconnect is only requested when `request_connect` is non-zero, ACKs are
 * enabled and the ACK thread has been started. The client is detached while
 * holding the mutex and closed after releasing it.
 */
static void control_ack_manager_reset(daemon_state_t *state, int request_connect) {
    kcp_client_t *detached = NULL;
    int want_reconnect = 0;

    if (state == NULL) {
        return;
    }

    pthread_mutex_lock(&state->control_ack_mutex);
    detached = state->control_ack_client;
    state->control_ack_client = NULL;
    if (request_connect && control_ack_enabled(state) && state->control_ack_thread_started) {
        want_reconnect = 1;
    }
    state->control_ack_connect_requested = want_reconnect;
    pthread_mutex_unlock(&state->control_ack_mutex);

    close_control_ack_client(&detached);
}
/*
 * Tear down the ACK manager: raise the shutdown flag, join the ACK thread if
 * it was started, close any remaining client and destroy the mutex.
 * A NULL `state` is ignored.
 */
static void control_ack_manager_destroy(daemon_state_t *state) {
    if (state == NULL) {
        return;
    }

    /* The worker loop polls this flag, so set it before joining. */
    atomic_store(&state->control_ack_shutdown_requested, 1);

    if (state->control_ack_thread_started != 0) {
        pthread_join(state->control_ack_thread, NULL);
        state->control_ack_thread_started = 0;
    }

    control_ack_manager_reset(state, 0);
    pthread_mutex_destroy(&state->control_ack_mutex);
}
/*
 * Serialise `root` and publish it at `path` by writing "<path>.tmp.<pid>"
 * and then rename()-ing it over `path`.
 *
 * Returns 0 on success. On failure returns -1 with errno describing the
 * first error (EINVAL for NULL arguments, ENAMETOOLONG when the temporary
 * path does not fit the local buffer, ENOMEM when serialisation fails,
 * otherwise the errno of the failing stdio/rename call); the temporary file
 * is removed and `path` is left untouched.
 */
static int write_status_json_atomic(const char *path, cJSON *root) {
    char temp_path[640];
    char *json = NULL;
    FILE *file = NULL;
    size_t json_len;
    int temp_created = 0;
    int saved_errno = 0;
    int written;

    if (path == NULL || root == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* A truncated temp name could alias an unrelated file; refuse it. */
    written = snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid());
    if (written < 0 || (size_t) written >= sizeof(temp_path)) {
        errno = ENAMETOOLONG;
        return -1;
    }

    json = cJSON_PrintUnformatted(root);
    if (json == NULL) {
        errno = ENOMEM;
        return -1;
    }

    file = fopen(temp_path, "wb");
    if (file == NULL) {
        saved_errno = errno;
        goto fail;
    }
    temp_created = 1;

    json_len = strlen(json);
    if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) {
        saved_errno = errno;
        fclose(file);
        goto fail;
    }
    /* fclose() can still report a deferred write error. */
    if (fclose(file) != 0) {
        saved_errno = errno;
        goto fail;
    }
    if (rename(temp_path, path) != 0) {
        saved_errno = errno;
        goto fail;
    }

    cJSON_free(json);
    return 0;

fail:
    if (temp_created) {
        unlink(temp_path);
    }
    cJSON_free(json);
    /* Restore the errno of the original failure, not of the cleanup calls. */
    errno = saved_errno;
    return -1;
}
/*
 * Snapshot the video and control statistics and publish them, together with
 * the pid and both thread heartbeats, as a JSON object at
 * state->status_file_path.
 *
 * The heartbeats are stored in seconds and reported in milliseconds
 * (value * 1000). Returns 0 on success, -1 with errno set when `state` is
 * NULL, the runtime directory cannot be ensured, the JSON object cannot be
 * created, or writing the file fails.
 */
static int write_daemon_status_file(daemon_state_t *state) {
    video_pipeline_stats_t video_snapshot;
    control_bridge_stats_t control_snapshot;
    cJSON *doc = NULL;
    int write_rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }
    if (ensure_runtime_dir(state->runtime_dir) != 0) {
        return -1;
    }

    memset(&video_snapshot, 0, sizeof(video_snapshot));
    memset(&control_snapshot, 0, sizeof(control_snapshot));
    video_pipeline_stats_snapshot(&state->video_stats, &video_snapshot);
    control_bridge_stats_snapshot(&state->control_stats, &control_snapshot);

    doc = cJSON_CreateObject();
    if (doc == NULL) {
        errno = ENOMEM;
        return -1;
    }

    /* Process-level fields. */
    cJSON_AddNumberToObject(doc, "updated_at_epoch_ms", (double) realtime_epoch_ms());
    cJSON_AddNumberToObject(doc, "pid", (double) getpid());
    cJSON_AddNumberToObject(doc, "video_thread_heartbeat_epoch_ms", (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0);
    cJSON_AddNumberToObject(doc, "control_thread_heartbeat_epoch_ms", (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0);

    /* Video pipeline fields. */
    cJSON_AddBoolToObject(doc, "video_connected", video_snapshot.connected != 0);
    cJSON_AddNumberToObject(doc, "video_frames_sent", (double) video_snapshot.frames_sent);
    cJSON_AddNumberToObject(doc, "video_send_errors", (double) video_snapshot.send_errors);
    cJSON_AddNumberToObject(doc, "video_backlog_resets", (double) video_snapshot.backlog_resets);
    cJSON_AddNumberToObject(doc, "video_last_capture_to_send_ms", (double) video_snapshot.last_capture_to_send_ms);
    cJSON_AddNumberToObject(doc, "video_avg_capture_to_send_ms", video_snapshot.avg_capture_to_send_ms);
    cJSON_AddStringToObject(doc, "video_last_error", video_snapshot.last_error);

    /* Control bridge fields. */
    cJSON_AddBoolToObject(doc, "control_registered", control_snapshot.registered != 0);
    cJSON_AddNumberToObject(doc, "control_reconnect_count", (double) control_snapshot.reconnect_count);
    cJSON_AddNumberToObject(doc, "control_unix_send_errors", (double) control_snapshot.unix_send_errors);
    cJSON_AddStringToObject(doc, "control_last_error", control_snapshot.last_error);

    write_rc = write_status_json_atomic(state->status_file_path, doc);
    cJSON_Delete(doc);
    return write_rc;
}
/*
 * Return 1 when the value stored in `heartbeat` is more than `timeout_sec`
 * seconds older than `now_sec`, 0 otherwise.
 *
 * Also returns 0 (never expired) when `heartbeat` is NULL, when
 * `timeout_sec` is not positive, or when the stored heartbeat is not
 * positive.
 */
static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) {
    long last_beat;

    if (heartbeat == NULL) {
        return 0;
    }
    if (timeout_sec <= 0) {
        return 0;
    }

    last_beat = atomic_load(heartbeat);
    if (last_beat <= 0) {
        return 0;
    }
    return (now_sec - last_beat) > timeout_sec ? 1 : 0;
}
/*
 * Watchdog check: terminate the process when the video or control thread
 * heartbeat is older than state->heartbeat_timeout_sec.
 *
 * Exits with EXIT_CODE_VIDEO_THREAD_STALLED or
 * EXIT_CODE_CONTROL_THREAD_STALLED; the video thread is checked first.
 * Does nothing when `state` is NULL or the timeout is not positive.
 *
 * NOTE(review): both the heartbeat stamps and `now` come from the wall
 * clock, so a forward clock step larger than the timeout would look like a
 * stall — confirm this is acceptable on hosts whose clock gets adjusted.
 */
static void exit_if_thread_stalled(daemon_state_t *state) {
    int timeout_sec;
    long wall_now;

    if (state == NULL) {
        return;
    }
    timeout_sec = state->heartbeat_timeout_sec;
    if (timeout_sec <= 0) {
        return;
    }

    wall_now = realtime_epoch_sec();

    if (thread_heartbeat_expired(&state->video_thread_heartbeat_epoch_sec, timeout_sec, wall_now)) {
        fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_VIDEO_THREAD_STALLED);
    }

    if (thread_heartbeat_expired(&state->control_thread_heartbeat_epoch_sec, timeout_sec, wall_now)) {
        fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_CONTROL_THREAD_STALLED);
    }
}
static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) { static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) {
if (stats == NULL) { if (stats == NULL) {
@@ -209,6 +552,147 @@ static void control_message_body_to_cstr(const message_t *msg, char *buffer, siz
buffer[copy_len] = '\0'; buffer[copy_len] = '\0';
} }
/*
 * Dial a new KCP client for the ACK channel, using the control server
 * address, relay and bind settings from `state`, registered under
 * state->control_ack_peer_id with control-default connection options.
 *
 * Returns the new client, or NULL on failure. errno is set to EINVAL when
 * `state` is NULL or the ACK peer id is missing or empty.
 */
static kcp_client_t *connect_control_ack_client(const daemon_state_t *state) {
    kcp_conn_options_t dial_options;
    const char *ack_peer_id;
    kcp_client_t *dialed;

    if (state == NULL) {
        errno = EINVAL;
        return NULL;
    }
    ack_peer_id = state->control_ack_peer_id;
    if (ack_peer_id == NULL || ack_peer_id[0] == '\0') {
        errno = EINVAL;
        return NULL;
    }

    kcp_conn_options_set_control_defaults(&dial_options);
    dialed = kcp_client_dial_with_options(
        state->control_server_addr,
        state->control_relay_via,
        ack_peer_id,
        state->control_bind_ip,
        state->control_bind_device,
        &dial_options,
        NULL,
        NULL,
        state->stats_logger,
        state->stats_interval_ms
    );
    return dialed;
}
/*
 * Close and free the client referenced by `*client_ptr`, then reset the
 * caller's pointer to NULL. A NULL `client_ptr` or NULL client is ignored.
 */
static void close_control_ack_client(kcp_client_t **client_ptr) {
    kcp_client_t *victim;

    if (client_ptr == NULL) {
        return;
    }
    victim = *client_ptr;
    if (victim == NULL) {
        return;
    }

    kcp_client_close(victim);
    kcp_client_free(victim);
    *client_ptr = NULL;
}
/*
 * Ask the ACK thread to establish a connection, unless a client is already
 * installed.
 *
 * Does nothing when `state` is NULL, ACKs are not enabled, or the ACK thread
 * was never started.
 */
static void control_ack_manager_request_connect(daemon_state_t *state) {
    if (state == NULL) {
        return;
    }
    if (!control_ack_enabled(state)) {
        return;
    }
    if (!state->control_ack_thread_started) {
        return;
    }

    pthread_mutex_lock(&state->control_ack_mutex);
    if (state->control_ack_client == NULL) {
        state->control_ack_connect_requested = 1;
    }
    pthread_mutex_unlock(&state->control_ack_mutex);
}
/*
 * Worker thread that establishes the ACK-channel connection on request.
 *
 * `arg` is the daemon_state_t. The loop runs until either the ACK shutdown
 * flag or *state->stop_requested becomes non-zero. Each iteration:
 *   1. under the mutex, claims a pending connect request (only if no client
 *      is installed and no connect is already in flight);
 *   2. with the mutex released, dials a new client;
 *   3. under the mutex again, installs the client only if the request is
 *      still pending, no client was installed meanwhile and no shutdown was
 *      requested; otherwise the freshly dialled client is closed.
 * Always returns NULL.
 */
static void *control_ack_thread_main(void *arg) {
daemon_state_t *state = (daemon_state_t *) arg;
while (!atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) {
kcp_client_t *client = NULL;
int connect_failed = 0;
int should_connect = 0;
pthread_mutex_lock(&state->control_ack_mutex);
if (state->control_ack_connect_requested && state->control_ack_client == NULL && !state->control_ack_connect_inflight) {
state->control_ack_connect_inflight = 1;
should_connect = 1;
}
pthread_mutex_unlock(&state->control_ack_mutex);
if (!should_connect) {
/* Nothing to do: poll again in 200 ms. */
usleep(200000);
continue;
}
/* Dial without holding the mutex so other users of it are not blocked. */
client = connect_control_ack_client(state);
connect_failed = client == NULL;
pthread_mutex_lock(&state->control_ack_mutex);
state->control_ack_connect_inflight = 0;
/* Re-validate: the request may have been cancelled while dialling. */
if (
client != NULL
&& state->control_ack_connect_requested
&& state->control_ack_client == NULL
&& !atomic_load(&state->control_ack_shutdown_requested)
&& !*state->stop_requested
) {
state->control_ack_client = client;
state->control_ack_connect_requested = 0;
/* Ownership moved into state; clear the local so it is not closed below. */
client = NULL;
}
pthread_mutex_unlock(&state->control_ack_mutex);
if (client != NULL) {
/* Client was not installed: close it outside the lock. */
close_control_ack_client(&client);
}
if (connect_failed && !atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) {
/* Back off for one second after a failed dial before retrying. */
sleep(1);
}
}
return NULL;
}
/*
 * Send a JSON ACK for `msg` over the ACK channel, best effort.
 *
 * The payload carries the message id, ack_phase "persist_end", the
 * receive-to-persist duration in microseconds
 * ((persist_end_unix_nano - recv_unix_nano) / 1000), a fixed
 * "unix_send_ok":true, and `sample_reason` ("sample_mod" when NULL).
 *
 * Nothing is sent when `state` or `msg` is NULL, when `recv_unix_nano` is
 * not positive, when `persist_end_unix_nano` is not later than
 * `recv_unix_nano`, when ACKs are disabled, or when the ACK thread was not
 * started. If no client is installed the ACK is dropped and a connect is
 * requested. If the send fails the client is detached, a reconnect is
 * requested, and the client is closed after the mutex is released.
 *
 * NOTE(review): `sample_reason` is inserted into the JSON without escaping;
 * confirm callers only pass JSON-safe text.
 */
static void maybe_send_control_ack(
daemon_state_t *state,
const message_t *msg,
int64_t recv_unix_nano,
int64_t persist_end_unix_nano,
const char *sample_reason
) {
kcp_client_t *ack_client = NULL;
kcp_client_t *client_to_close = NULL;
char *payload = NULL;
int send_rc = -1;
if (
state == NULL || msg == NULL || recv_unix_nano <= 0 || persist_end_unix_nano <= recv_unix_nano
|| !control_ack_enabled(state) || !state->control_ack_thread_started
) {
return;
}
/* Build the payload before taking the mutex. */
payload = omni_strdup_printf(
"{\"message_id\":%" PRIu64 ",\"ack_phase\":\"persist_end\",\"b_recv_to_persist_us\":%" PRId64 ",\"unix_send_ok\":true,\"sample_reason\":\"%s\"}",
msg->id,
(persist_end_unix_nano - recv_unix_nano) / 1000,
sample_reason == NULL ? "sample_mod" : sample_reason
);
if (payload == NULL) {
return;
}
pthread_mutex_lock(&state->control_ack_mutex);
ack_client = state->control_ack_client;
if (ack_client == NULL) {
/* No connection yet: drop this ACK and ask for a connect. */
state->control_ack_connect_requested = 1;
pthread_mutex_unlock(&state->control_ack_mutex);
free(payload);
return;
}
/* The send happens while holding the mutex, with the client still installed in state. */
send_rc = kcp_client_send_text(ack_client, state->control_ack_target_peer, payload);
if (send_rc != 0) {
/* Detach the failed client and request a fresh connection. */
client_to_close = state->control_ack_client;
state->control_ack_client = NULL;
state->control_ack_connect_requested = 1;
}
pthread_mutex_unlock(&state->control_ack_mutex);
free(payload);
if (client_to_close != NULL) {
/* Close outside the lock. */
close_control_ack_client(&client_to_close);
}
}
static int unix_dgram_client_init(unix_dgram_client_t *client, const char *dest_path) { static int unix_dgram_client_init(unix_dgram_client_t *client, const char *dest_path) {
struct sockaddr_un bind_addr; struct sockaddr_un bind_addr;
pid_t pid; pid_t pid;
@@ -295,7 +779,10 @@ static void *video_thread_main(void *arg) {
daemon_state_t *state = (daemon_state_t *) arg; daemon_state_t *state = (daemon_state_t *) arg;
while (!*state->stop_requested) { while (!*state->stop_requested) {
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
maybe_inject_thread_stall(state, state->video_thread_fault_file, "video");
int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested); int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested);
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
if (video_rc == 0) { if (video_rc == 0) {
break; break;
@@ -318,6 +805,8 @@ static void *control_thread_main(void *arg) {
kcp_client_t *client = NULL; kcp_client_t *client = NULL;
int reconnect_immediately = 0; int reconnect_immediately = 0;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
maybe_inject_thread_stall(state, state->control_thread_fault_file, "control");
kcp_conn_options_set_control_defaults(&options); kcp_conn_options_set_control_defaults(&options);
client = kcp_client_dial_with_options( client = kcp_client_dial_with_options(
state->control_server_addr, state->control_server_addr,
@@ -328,8 +817,8 @@ static void *control_thread_main(void *arg) {
&options, &options,
NULL, NULL,
NULL, NULL,
NULL, state->stats_logger,
KCP_DEFAULT_STATS_INTERVAL_MS state->stats_interval_ms
); );
if (client == NULL) { if (client == NULL) {
control_bridge_set_errno_error(&state->control_stats, "failed to connect control session"); control_bridge_set_errno_error(&state->control_stats, "failed to connect control session");
@@ -355,14 +844,22 @@ static void *control_thread_main(void *arg) {
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex); pthread_mutex_unlock(&state->control_stats.mutex);
} }
control_ack_manager_request_connect(state);
while (!*state->stop_requested) { while (!*state->stop_requested) {
message_t msg; message_t msg;
int rc; int rc;
kcp_client_state_t client_state; kcp_client_state_t client_state;
int ack_sampled = 0;
int log_control_latency = 0;
int64_t recv_unix_nano = 0;
int64_t persist_begin_unix_nano = 0;
int64_t persist_end_unix_nano = 0;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
protocol_message_init(&msg); protocol_message_init(&msg);
rc = kcp_client_receive_timed(client, &msg, 100); rc = kcp_client_receive_timed(client, &msg, 100);
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
if (rc == 1) { if (rc == 1) {
char reconnect_reason[256]; char reconnect_reason[256];
@@ -462,6 +959,29 @@ static void *control_thread_main(void *arg) {
continue; continue;
} }
ack_sampled = should_send_control_ack(state, &msg);
log_control_latency = ack_sampled || should_log_control_latency(state, &msg);
if (log_control_latency) {
recv_unix_nano = omni_now_unix_nano();
persist_begin_unix_nano = recv_unix_nano;
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_APP_RECV,
recv_unix_nano,
&msg
);
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_BEGIN,
persist_begin_unix_nano,
&msg
);
}
if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) { if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) {
int send_errno = errno; int send_errno = errno;
int recovered = 0; int recovered = 0;
@@ -478,6 +998,20 @@ static void *control_thread_main(void *arg) {
state->control_stats.server_idle_ms = client_state.server_idle_ms; state->control_stats.server_idle_ms = client_state.server_idle_ms;
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex); pthread_mutex_unlock(&state->control_stats.mutex);
if (log_control_latency) {
persist_end_unix_nano = omni_now_unix_nano();
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_END,
persist_end_unix_nano,
&msg
);
}
if (ack_sampled) {
maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod");
}
protocol_message_clear(&msg); protocol_message_clear(&msg);
continue; continue;
} }
@@ -498,6 +1032,20 @@ static void *control_thread_main(void *arg) {
state->control_stats.server_idle_ms = client_state.server_idle_ms; state->control_stats.server_idle_ms = client_state.server_idle_ms;
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex); pthread_mutex_unlock(&state->control_stats.mutex);
if (log_control_latency) {
persist_end_unix_nano = omni_now_unix_nano();
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_END,
persist_end_unix_nano,
&msg
);
}
if (ack_sampled) {
maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod");
}
protocol_message_clear(&msg); protocol_message_clear(&msg);
} }
@@ -505,6 +1053,7 @@ static void *control_thread_main(void *arg) {
state->control_stats.registered = 0; state->control_stats.registered = 0;
state->control_stats.server_idle_ms = 0; state->control_stats.server_idle_ms = 0;
pthread_mutex_unlock(&state->control_stats.mutex); pthread_mutex_unlock(&state->control_stats.mutex);
control_ack_manager_reset(state, 0);
kcp_client_close(client); kcp_client_close(client);
kcp_client_free(client); kcp_client_free(client);
if (!*state->stop_requested && !reconnect_immediately) { if (!*state->stop_requested && !reconnect_immediately) {
@@ -526,13 +1075,15 @@ static void print_stats(daemon_state_t *state) {
fprintf( fprintf(
stderr, stderr,
"[b_side_omnid] video registered=%d frames=%llu bytes=%llu drops=%llu resets=%llu backlog=%u reason=%s srtt=%dms | control registered=%d idle=%ums reconnects=%llu forwarded=%llu invalid=%llu unix_err=%llu srtt=%dms last_reconnect=%s\n", "[b_side_omnid] video registered=%d frames=%llu bytes=%llu drops=%llu resets=%llu backlog=%u cap2send=%ums avg=%.1fms reason=%s srtt=%dms | control registered=%d idle=%ums reconnects=%llu forwarded=%llu invalid=%llu unix_err=%llu srtt=%dms last_reconnect=%s\n",
video_stats.connected, video_stats.connected,
(unsigned long long) video_stats.frames_sent, (unsigned long long) video_stats.frames_sent,
(unsigned long long) video_stats.bytes_sent, (unsigned long long) video_stats.bytes_sent,
(unsigned long long) video_stats.backpressure_drops, (unsigned long long) video_stats.backpressure_drops,
(unsigned long long) video_stats.backlog_resets, (unsigned long long) video_stats.backlog_resets,
video_stats.last_backlog_segments, video_stats.last_backlog_segments,
video_stats.last_capture_to_send_ms,
video_stats.avg_capture_to_send_ms,
video_stats.last_backlog_reason[0] == '\0' ? "-" : video_stats.last_backlog_reason, video_stats.last_backlog_reason[0] == '\0' ? "-" : video_stats.last_backlog_reason,
video_stats.transport.srtt_ms, video_stats.transport.srtt_ms,
control_stats.registered, control_stats.registered,
@@ -550,6 +1101,7 @@ int main(void) {
daemon_state_t state; daemon_state_t state;
pthread_t video_thread; pthread_t video_thread;
pthread_t control_thread; pthread_t control_thread;
long initial_heartbeat;
memset(&state, 0, sizeof(state)); memset(&state, 0, sizeof(state));
state.stop_requested = &g_stop_requested; state.stop_requested = &g_stop_requested;
@@ -562,11 +1114,44 @@ int main(void) {
state.control_bind_device = env_first_nonempty("OMNI_CONTROL_BIND_DEVICE", "OMNISOCKET_BIND_DEVICE", ""); state.control_bind_device = env_first_nonempty("OMNI_CONTROL_BIND_DEVICE", "OMNISOCKET_BIND_DEVICE", "");
state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID); state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID);
state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER); state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER);
state.control_ack_peer_id = env_or_default("OMNI_CONTROL_ACK_PEER_ID", CONTROL_ACK_DEFAULT_PEER_ID);
state.control_ack_target_peer = env_or_default("OMNI_CONTROL_ACK_TARGET_PEER", CONTROL_ACK_DEFAULT_TARGET_PEER);
state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET); state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET);
state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR);
state.heartbeat_timeout_sec = env_int_or_default(
"BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
);
state.stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", DEFAULT_KCP_STATS_INTERVAL_MS);
state.control_latency_sample_mod = env_u64_or_default("BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD", DEFAULT_CONTROL_LATENCY_SAMPLE_MOD);
state.control_ack_sample_mod = env_u64_or_default("BLITZ_CONTROL_ACK_SAMPLE_MOD", DEFAULT_CONTROL_ACK_SAMPLE_MOD);
state.video_config.progress_callback = video_pipeline_heartbeat_progress;
state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
state.video_config.stats_logger = NULL;
state.video_config.stage_logger = NULL;
state.video_config.stats_interval_ms = state.stats_interval_ms;
state.control_server_idle_reconnect_ms = env_int_or_default( state.control_server_idle_reconnect_ms = env_int_or_default(
"OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS", "OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
); );
snprintf(state.status_file_path, sizeof(state.status_file_path), "%s/%s", state.runtime_dir, DEFAULT_STATUS_FILE_NAME);
snprintf(
state.video_thread_fault_file,
sizeof(state.video_thread_fault_file),
"%s/%s",
state.runtime_dir,
DEFAULT_VIDEO_THREAD_FAULT_FILE
);
snprintf(
state.control_thread_fault_file,
sizeof(state.control_thread_fault_file),
"%s/%s",
state.runtime_dir,
DEFAULT_CONTROL_THREAD_FAULT_FILE
);
initial_heartbeat = realtime_epoch_sec();
atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat);
atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat);
if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' || if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' ||
state.control_server_addr == NULL || state.control_server_addr[0] == '\0') { state.control_server_addr == NULL || state.control_server_addr[0] == '\0') {
@@ -583,8 +1168,15 @@ int main(void) {
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
return 1; return 1;
} }
if (control_ack_manager_init(&state) != 0) {
perror("control_ack_manager_init");
control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats);
return 1;
}
if (unix_dgram_client_init(&state.unix_client, state.control_unix_socket) != 0) { if (unix_dgram_client_init(&state.unix_client, state.control_unix_socket) != 0) {
perror("unix_dgram_client_init"); perror("unix_dgram_client_init");
control_ack_manager_destroy(&state);
control_bridge_stats_destroy(&state.control_stats); control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
return 1; return 1;
@@ -599,16 +1191,60 @@ int main(void) {
if (install_signal_handler(SIGINT) != 0 || install_signal_handler(SIGTERM) != 0) { if (install_signal_handler(SIGINT) != 0 || install_signal_handler(SIGTERM) != 0) {
perror("install_signal_handler"); perror("install_signal_handler");
unix_dgram_client_close(&state.unix_client); unix_dgram_client_close(&state.unix_client);
control_ack_manager_destroy(&state);
control_bridge_stats_destroy(&state.control_stats); control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
return 1; return 1;
} }
{
const char *stats_log_path = getenv("BLITZ_KCP_STATS_LOG_PATH");
const char *latency_log_path = getenv("BLITZ_CONTROL_LATENCY_LOG_PATH");
const char *video_stage_log_path = getenv("BLITZ_VIDEO_STAGE_LOG_PATH");
int latency_enabled = env_int_or_default("BLITZ_CONTROL_LATENCY_LOG_ENABLED", 1);
int video_stage_log_enabled = env_int_or_default("BLITZ_VIDEO_STAGE_LOG_ENABLED", 1);
uint64_t video_stage_log_sample_mod = env_u64_or_default("BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD", 10);
if (stats_log_path != NULL && stats_log_path[0] != '\0') {
state.stats_logger = kcp_session_stats_open_jsonl(stats_log_path);
if (state.stats_logger == NULL) {
fprintf(stderr, "[b_side_omnid] warning: failed to open KCP stats log %s\n", stats_log_path);
}
}
if (latency_enabled && latency_log_path != NULL && latency_log_path[0] != '\0') {
state.control_latency_logger = latencylog_open_jsonl(latency_log_path);
if (state.control_latency_logger == NULL) {
fprintf(stderr, "[b_side_omnid] warning: failed to open control latency log %s\n", latency_log_path);
}
}
if (video_stage_log_enabled && video_stage_log_path != NULL && video_stage_log_path[0] != '\0') {
state.video_stage_logger = video_stage_logger_open_jsonl(video_stage_log_path, video_stage_log_sample_mod);
if (state.video_stage_logger == NULL) {
fprintf(stderr, "[b_side_omnid] warning: failed to open video stage log %s\n", video_stage_log_path);
}
}
state.video_config.stats_logger = state.stats_logger;
state.video_config.stage_logger = state.video_stage_logger;
state.video_config.stats_interval_ms = state.stats_interval_ms;
}
if (control_ack_enabled(&state)) {
if (pthread_create(&state.control_ack_thread, NULL, control_ack_thread_main, &state) != 0) {
fprintf(stderr, "[b_side_omnid] warning: failed to start async control ACK manager, ACK sampling disabled\n");
} else {
state.control_ack_thread_started = 1;
}
}
if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) { if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) {
perror("pthread_create(video_thread)"); perror("pthread_create(video_thread)");
unix_dgram_client_close(&state.unix_client); unix_dgram_client_close(&state.unix_client);
control_ack_manager_destroy(&state);
control_bridge_stats_destroy(&state.control_stats); control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
latencylog_close(state.control_latency_logger);
video_stage_logger_close(state.video_stage_logger);
kcp_session_stats_close(state.stats_logger);
return 1; return 1;
} }
if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) { if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) {
@@ -616,20 +1252,32 @@ int main(void) {
g_stop_requested = 1; g_stop_requested = 1;
pthread_join(video_thread, NULL); pthread_join(video_thread, NULL);
unix_dgram_client_close(&state.unix_client); unix_dgram_client_close(&state.unix_client);
control_ack_manager_destroy(&state);
control_bridge_stats_destroy(&state.control_stats); control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
latencylog_close(state.control_latency_logger);
video_stage_logger_close(state.video_stage_logger);
kcp_session_stats_close(state.stats_logger);
return 1; return 1;
} }
while (!g_stop_requested) { while (!g_stop_requested) {
sleep(1); sleep(1);
print_stats(&state); print_stats(&state);
if (write_daemon_status_file(&state) != 0) {
fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno));
}
exit_if_thread_stalled(&state);
} }
pthread_join(video_thread, NULL); pthread_join(video_thread, NULL);
pthread_join(control_thread, NULL); pthread_join(control_thread, NULL);
unix_dgram_client_close(&state.unix_client); unix_dgram_client_close(&state.unix_client);
control_ack_manager_destroy(&state);
control_bridge_stats_destroy(&state.control_stats); control_bridge_stats_destroy(&state.control_stats);
video_pipeline_stats_destroy(&state.video_stats); video_pipeline_stats_destroy(&state.video_stats);
latencylog_close(state.control_latency_logger);
video_stage_logger_close(state.video_stage_logger);
kcp_session_stats_close(state.stats_logger);
return 0; return 0;
} }

View File

@@ -24,8 +24,12 @@ typedef struct kcp_session_stats_record {
uint32_t rto_ms; uint32_t rto_ms;
int has_srtt_ms; int has_srtt_ms;
int32_t srtt_ms; int32_t srtt_ms;
int has_min_srtt_ms;
int32_t min_srtt_ms;
int has_srttvar_ms; int has_srttvar_ms;
int32_t srttvar_ms; int32_t srttvar_ms;
int has_last_feedback_age_ms;
uint32_t last_feedback_age_ms;
int has_snd_wnd; int has_snd_wnd;
uint32_t snd_wnd; uint32_t snd_wnd;
int has_rmt_wnd; int has_rmt_wnd;

View File

@@ -3,6 +3,7 @@
#include <errno.h> #include <errno.h>
#include <inttypes.h> #include <inttypes.h>
#include <limits.h>
#include <pthread.h> #include <pthread.h>
#include <stdarg.h> #include <stdarg.h>
#include <stdbool.h> #include <stdbool.h>
@@ -31,6 +32,15 @@
typedef struct omni_file_logger { typedef struct omni_file_logger {
FILE *file; FILE *file;
pthread_mutex_t mutex; pthread_mutex_t mutex;
char path[PATH_MAX];
size_t current_bytes;
size_t buffered_bytes;
size_t flush_bytes;
size_t max_bytes;
int flush_interval_ms;
int max_files;
int immediate_flush;
uint64_t last_flush_monotonic_ms;
} omni_file_logger_t; } omni_file_logger_t;
int64_t omni_now_unix_nano(void); int64_t omni_now_unix_nano(void);
@@ -61,6 +71,7 @@ double omni_duration_ms_to_ns(double ms);
const char *omni_path_base_name(const char *path); const char *omni_path_base_name(const char *path);
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file); void omni_file_logger_init(omni_file_logger_t *logger, FILE *file);
void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush);
void omni_file_logger_destroy(omni_file_logger_t *logger); void omni_file_logger_destroy(omni_file_logger_t *logger);
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line); int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line);

View File

@@ -28,6 +28,7 @@ kcp_client_t *kcp_client_dial(const char *server_addr, const char *dial_addr, co
const char *kcp_client_id(const kcp_client_t *client); const char *kcp_client_id(const kcp_client_t *client);
int kcp_client_send_text(kcp_client_t *client, const char *to, const char *text); int kcp_client_send_text(kcp_client_t *client, const char *to, const char *text);
int kcp_client_send_binary(kcp_client_t *client, const char *to, const void *data, size_t data_len); int kcp_client_send_binary(kcp_client_t *client, const char *to, const void *data, size_t data_len);
int kcp_client_send_binary_with_id(kcp_client_t *client, const char *to, const void *data, size_t data_len, uint64_t *out_id);
int kcp_client_send_file_path(kcp_client_t *client, const char *to, const char *path); int kcp_client_send_file_path(kcp_client_t *client, const char *to, const char *path);
int kcp_client_receive_timed(kcp_client_t *client, message_t *out_msg, int timeout_ms); int kcp_client_receive_timed(kcp_client_t *client, message_t *out_msg, int timeout_ms);
int kcp_client_receive(kcp_client_t *client, message_t *out_msg); int kcp_client_receive(kcp_client_t *client, message_t *out_msg);

View File

@@ -56,7 +56,9 @@ typedef struct kcp_runtime_stats {
uint32_t conv; uint32_t conv;
uint32_t rto_ms; uint32_t rto_ms;
int32_t srtt_ms; int32_t srtt_ms;
int32_t min_srtt_ms;
int32_t srttvar_ms; int32_t srttvar_ms;
uint32_t last_feedback_age_ms;
uint32_t snd_wnd; uint32_t snd_wnd;
uint32_t rmt_wnd; uint32_t rmt_wnd;
uint32_t inflight; uint32_t inflight;

View File

@@ -6,20 +6,34 @@
#include <stdint.h> #include <stdint.h>
#include "gps_buffer.h" #include "gps_buffer.h"
#include "omni_common.h"
#include "peer_kcp_client.h" #include "peer_kcp_client.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
#if defined(__GNUC__)
typedef struct __attribute__((packed)) video_pipeline_packet_metadata {
#else
typedef struct video_pipeline_packet_metadata { typedef struct video_pipeline_packet_metadata {
#endif
uint64_t timestamp_ms; uint64_t timestamp_ms;
double latitude; double latitude;
double longitude; double longitude;
uint32_t capture_to_send_ms;
} video_pipeline_packet_metadata_t; } video_pipeline_packet_metadata_t;
/*
 * State for the video stage log. Pointers to this type are returned by
 * video_stage_logger_open_jsonl() and accepted by video_stage_logger_close(),
 * and video_pipeline_config_t carries one in its stage_logger field.
 */
typedef struct video_stage_logger {
/* Embedded file logger; presumably the sink the stage records are written to -- TODO confirm. */
omni_file_logger_t file_logger;
/* NOTE(review): presumably non-zero while stage logging is active -- confirm against the implementation. */
int enabled;
/* NOTE(review): presumably the sample_mod passed to video_stage_logger_open_jsonl(), i.e. a 1-in-N sampling modulus -- confirm. */
uint64_t sample_mod;
} video_stage_logger_t;
/*
 * Callback type taking a single opaque context pointer and returning nothing.
 * video_pipeline_config_t stores one as progress_callback together with a
 * progress_context pointer.
 * NOTE(review): presumably invoked by the pipeline with progress_context to
 * signal forward progress -- confirm when and from which thread it is called.
 */
typedef void (*video_pipeline_progress_fn)(void *context);
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(video_pipeline_packet_metadata_t) == 24, "video trailer metadata must be 24 bytes"); _Static_assert(sizeof(video_pipeline_packet_metadata_t) == 28, "video trailer metadata must be 28 bytes");
#endif #endif
typedef struct video_pipeline_config { typedef struct video_pipeline_config {
@@ -39,6 +53,12 @@ typedef struct video_pipeline_config {
int soft_backpressure_segments; int soft_backpressure_segments;
int hard_backpressure_segments; int hard_backpressure_segments;
int hard_backpressure_hold_ms; int hard_backpressure_hold_ms;
int frame_stall_reconnect_ms;
kcp_session_stats_logger_t *stats_logger;
video_stage_logger_t *stage_logger;
int stats_interval_ms;
video_pipeline_progress_fn progress_callback;
void *progress_context;
} video_pipeline_config_t; } video_pipeline_config_t;
typedef struct video_pipeline_stats { typedef struct video_pipeline_stats {
@@ -50,6 +70,8 @@ typedef struct video_pipeline_stats {
uint64_t backlog_resets; uint64_t backlog_resets;
uint64_t last_frame_bytes; uint64_t last_frame_bytes;
uint32_t last_backlog_segments; uint32_t last_backlog_segments;
uint32_t last_capture_to_send_ms;
double avg_capture_to_send_ms;
int connected; int connected;
char last_error[256]; char last_error[256];
char last_backlog_reason[128]; char last_backlog_reason[128];
@@ -63,6 +85,8 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config);
int video_pipeline_stats_init(video_pipeline_stats_t *stats); int video_pipeline_stats_init(video_pipeline_stats_t *stats);
void video_pipeline_stats_destroy(video_pipeline_stats_t *stats); void video_pipeline_stats_destroy(video_pipeline_stats_t *stats);
void video_pipeline_stats_snapshot(video_pipeline_stats_t *stats, video_pipeline_stats_t *out_stats); void video_pipeline_stats_snapshot(video_pipeline_stats_t *stats, video_pipeline_stats_t *out_stats);
video_stage_logger_t *video_stage_logger_open_jsonl(const char *path, uint64_t sample_mod);
void video_stage_logger_close(video_stage_logger_t *logger);
int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_stats_t *stats, volatile sig_atomic_t *stop_requested); int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_stats_t *stats, volatile sig_atomic_t *stop_requested);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -119,7 +119,9 @@ static PyObject *build_kcp_stats_dict(const omnisocket_session_kcp_stats_t *stat
SET_KCP_STAT("conv", PyLong_FromUnsignedLong(stats->conv)); SET_KCP_STAT("conv", PyLong_FromUnsignedLong(stats->conv));
SET_KCP_STAT("rto_ms", PyLong_FromUnsignedLong(stats->rto_ms)); SET_KCP_STAT("rto_ms", PyLong_FromUnsignedLong(stats->rto_ms));
SET_KCP_STAT("srtt_ms", PyLong_FromLong(stats->srtt_ms)); SET_KCP_STAT("srtt_ms", PyLong_FromLong(stats->srtt_ms));
SET_KCP_STAT("min_srtt_ms", PyLong_FromLong(stats->min_srtt_ms));
SET_KCP_STAT("srttvar_ms", PyLong_FromLong(stats->srttvar_ms)); SET_KCP_STAT("srttvar_ms", PyLong_FromLong(stats->srttvar_ms));
SET_KCP_STAT("last_feedback_age_ms", PyLong_FromUnsignedLong(stats->last_feedback_age_ms));
SET_KCP_STAT("snd_wnd", PyLong_FromUnsignedLong(stats->snd_wnd)); SET_KCP_STAT("snd_wnd", PyLong_FromUnsignedLong(stats->snd_wnd));
SET_KCP_STAT("rmt_wnd", PyLong_FromUnsignedLong(stats->rmt_wnd)); SET_KCP_STAT("rmt_wnd", PyLong_FromUnsignedLong(stats->rmt_wnd));
SET_KCP_STAT("inflight", PyLong_FromUnsignedLong(stats->inflight)); SET_KCP_STAT("inflight", PyLong_FromUnsignedLong(stats->inflight));
@@ -279,6 +281,29 @@ static PyObject *PyOmniSession_send(PyOmniSession *self, PyObject *args, PyObjec
Py_RETURN_NONE; Py_RETURN_NONE;
} }
/*
 * Session.send_with_id(to, data) -> int
 *
 * Sends the bytes-like object `data` to peer `to` through
 * omnisocket_session_send_with_id() and returns the message id reported by
 * that call as a Python int.
 *
 * The GIL is released while the native send runs. On failure (non-zero return
 * code) an OSError built from errno is raised and NULL is returned. Argument
 * parsing errors also return NULL with the exception already set.
 */
static PyObject *PyOmniSession_send_with_id(PyOmniSession *self, PyObject *args, PyObject *kwargs) {
    static char *kwlist[] = {"to", "data", NULL};
    const char *to;
    Py_buffer payload;
    uint64_t message_id = 0;
    int rc;
    int saved_errno;

    memset(&payload, 0, sizeof(payload));
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "sy*", kwlist, &to, &payload)) {
        return NULL;
    }

    Py_BEGIN_ALLOW_THREADS
    rc = omnisocket_session_send_with_id(&self->session, to, payload.buf, (size_t) payload.len, &message_id);
    /* Capture errno right after the send: the calls below may overwrite it. */
    saved_errno = errno;
    Py_END_ALLOW_THREADS

    PyBuffer_Release(&payload);
    if (rc != 0) {
        /* Report the error of the failed send, not of any later cleanup call. */
        errno = saved_errno;
        return PyErr_SetFromErrno(PyExc_OSError);
    }
    return PyLong_FromUnsignedLongLong((unsigned long long) message_id);
}
static PyObject *PyOmniSession_recv(PyOmniSession *self, PyObject *args, PyObject *kwargs) { static PyObject *PyOmniSession_recv(PyOmniSession *self, PyObject *args, PyObject *kwargs) {
int timeout_ms = -1; int timeout_ms = -1;
int rc; int rc;
@@ -379,6 +404,7 @@ static PyMethodDef PyOmniSession_methods[] = {
{"connect", (PyCFunction) PyOmniSession_connect, METH_VARARGS | METH_KEYWORDS, NULL}, {"connect", (PyCFunction) PyOmniSession_connect, METH_VARARGS | METH_KEYWORDS, NULL},
{"close", (PyCFunction) PyOmniSession_close, METH_NOARGS, NULL}, {"close", (PyCFunction) PyOmniSession_close, METH_NOARGS, NULL},
{"send", (PyCFunction) PyOmniSession_send, METH_VARARGS | METH_KEYWORDS, NULL}, {"send", (PyCFunction) PyOmniSession_send, METH_VARARGS | METH_KEYWORDS, NULL},
{"send_with_id", (PyCFunction) PyOmniSession_send_with_id, METH_VARARGS | METH_KEYWORDS, NULL},
{"recv", (PyCFunction) PyOmniSession_recv, METH_VARARGS | METH_KEYWORDS, PyOmniSession_recv_doc}, {"recv", (PyCFunction) PyOmniSession_recv, METH_VARARGS | METH_KEYWORDS, PyOmniSession_recv_doc},
{"recv_into", (PyCFunction) PyOmniSession_recv_into, METH_VARARGS | METH_KEYWORDS, PyOmniSession_recv_into_doc}, {"recv_into", (PyCFunction) PyOmniSession_recv_into, METH_VARARGS | METH_KEYWORDS, PyOmniSession_recv_into_doc},
{"stats", (PyCFunction) PyOmniSession_stats, METH_NOARGS, NULL}, {"stats", (PyCFunction) PyOmniSession_stats, METH_NOARGS, NULL},

View File

@@ -167,6 +167,16 @@ int omnisocket_session_close(omnisocket_session_t *session) {
} }
int omnisocket_session_send(omnisocket_session_t *session, const char *to, const void *data, size_t data_len) { int omnisocket_session_send(omnisocket_session_t *session, const char *to, const void *data, size_t data_len) {
return omnisocket_session_send_with_id(session, to, data, data_len, NULL);
}
int omnisocket_session_send_with_id(
omnisocket_session_t *session,
const char *to,
const void *data,
size_t data_len,
uint64_t *out_message_id
) {
kcp_client_t *client; kcp_client_t *client;
int rc; int rc;
@@ -178,7 +188,7 @@ int omnisocket_session_send(omnisocket_session_t *session, const char *to, const
if (omnisocket_session_begin_client_op(session, &client) != 0) { if (omnisocket_session_begin_client_op(session, &client) != 0) {
return -1; return -1;
} }
rc = kcp_client_send_binary(client, to, data, data_len); rc = kcp_client_send_binary_with_id(client, to, data, data_len, out_message_id);
pthread_mutex_lock(&session->mutex); pthread_mutex_lock(&session->mutex);
if (rc == 0) { if (rc == 0) {
session->stats.send_calls += 1; session->stats.send_calls += 1;
@@ -297,7 +307,9 @@ void omnisocket_session_kcp_stats_snapshot(omnisocket_session_t *session, omniso
out_stats->conv = runtime_stats.conv; out_stats->conv = runtime_stats.conv;
out_stats->rto_ms = runtime_stats.rto_ms; out_stats->rto_ms = runtime_stats.rto_ms;
out_stats->srtt_ms = runtime_stats.srtt_ms; out_stats->srtt_ms = runtime_stats.srtt_ms;
out_stats->min_srtt_ms = runtime_stats.min_srtt_ms;
out_stats->srttvar_ms = runtime_stats.srttvar_ms; out_stats->srttvar_ms = runtime_stats.srttvar_ms;
out_stats->last_feedback_age_ms = runtime_stats.last_feedback_age_ms;
out_stats->snd_wnd = runtime_stats.snd_wnd; out_stats->snd_wnd = runtime_stats.snd_wnd;
out_stats->rmt_wnd = runtime_stats.rmt_wnd; out_stats->rmt_wnd = runtime_stats.rmt_wnd;
out_stats->inflight = runtime_stats.inflight; out_stats->inflight = runtime_stats.inflight;

View File

@@ -22,7 +22,9 @@ typedef struct omnisocket_session_kcp_stats {
uint32_t conv; uint32_t conv;
uint32_t rto_ms; uint32_t rto_ms;
int32_t srtt_ms; int32_t srtt_ms;
int32_t min_srtt_ms;
int32_t srttvar_ms; int32_t srttvar_ms;
uint32_t last_feedback_age_ms;
uint32_t snd_wnd; uint32_t snd_wnd;
uint32_t rmt_wnd; uint32_t rmt_wnd;
uint32_t inflight; uint32_t inflight;
@@ -72,6 +74,13 @@ int omnisocket_session_connect(
); );
int omnisocket_session_close(omnisocket_session_t *session); int omnisocket_session_close(omnisocket_session_t *session);
int omnisocket_session_send(omnisocket_session_t *session, const char *to, const void *data, size_t data_len); int omnisocket_session_send(omnisocket_session_t *session, const char *to, const void *data, size_t data_len);
/*
 * Send a binary payload of data_len bytes to peer `to`, like
 * omnisocket_session_send(), with an extra out_message_id pointer that is
 * forwarded to kcp_client_send_binary_with_id(). omnisocket_session_send()
 * is this call with out_message_id == NULL, so NULL is an accepted value.
 *
 * Returns 0 on success; -1 is returned when the session cannot start a client
 * operation, otherwise the underlying send's return code is used.
 * NOTE(review): presumably *out_message_id receives the id assigned to the
 * sent message on success -- confirm in kcp_client_send_binary_with_id().
 */
int omnisocket_session_send_with_id(
omnisocket_session_t *session,
const char *to,
const void *data,
size_t data_len,
uint64_t *out_message_id
);
int omnisocket_session_recv(omnisocket_session_t *session, message_t *out_msg, int timeout_ms); int omnisocket_session_recv(omnisocket_session_t *session, message_t *out_msg, int timeout_ms);
int omnisocket_session_recv_into( int omnisocket_session_recv_into(
omnisocket_session_t *session, omnisocket_session_t *session,

View File

@@ -24,7 +24,6 @@ setup(
maintainer_email='codex@example.com', maintainer_email='codex@example.com',
description='ROS 2 OmniSocket UDP/KCP bridge for teleop TwistStamped commands.', description='ROS 2 OmniSocket UDP/KCP bridge for teleop TwistStamped commands.',
license='MIT', license='MIT',
tests_require=['pytest'],
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'cmd_vel_udp_sender = udp_teleop_bridge.cmd_vel_udp_sender:main', 'cmd_vel_udp_sender = udp_teleop_bridge.cmd_vel_udp_sender:main',

View File

@@ -72,6 +72,12 @@ class OmniTransport:
def send(self, *, to: str, data: bytes) -> None: def send(self, *, to: str, data: bytes) -> None:
self._session.send(to=to, data=data) self._session.send(to=to, data=data)
def send_with_id(self, *, to: str, data: bytes) -> int:
    """Send ``data`` to peer ``to`` and return the session-reported message id.

    Raises:
        RuntimeError: if the underlying session object has no
            ``send_with_id`` method. The payload is still handed to the
            plain ``send`` before the error is raised.
    """
    session = self._session
    if hasattr(session, 'send_with_id'):
        message_id = session.send_with_id(to=to, data=data)
        return int(message_id)
    # NOTE(review): the payload is transmitted via send() *before* raising, so a
    # caller that catches RuntimeError and retries with send() would transmit
    # the same payload twice -- confirm this fallback is intentional.
    session.send(to=to, data=data)
    raise RuntimeError('send_with_id is not available on this omnisocket build')
def recv(self, *, timeout_ms: int = -1): def recv(self, *, timeout_ms: int = -1):
return self._session.recv(timeout_ms=timeout_ms) return self._session.recv(timeout_ms=timeout_ms)

View File

@@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import json
import os import os
import socket import socket
import threading import threading
@@ -90,8 +91,14 @@ class UdpCmdVelReceiver(Node):
self._last_published_command: CommandTuple = ZERO_COMMAND self._last_published_command: CommandTuple = ZERO_COMMAND
self._closing = threading.Event() self._closing = threading.Event()
self._recv_buffer = bytearray(DEFAULT_RECV_BUFFER_BYTES) self._recv_buffer = bytearray(DEFAULT_RECV_BUFFER_BYTES)
self._runtime_dir = os.getenv('BLITZ_RUNTIME_DIR', '/run/blitz-robot').strip() or '/run/blitz-robot'
self._status_path = os.path.join(self._runtime_dir, 'ros-receiver.status.json')
self._transport_reconnect_count = 0
self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms()
self._runtime_last_error = ''
self.create_timer(1.0 / self._publish_rate_hz, self._publish_tick) self.create_timer(1.0 / self._publish_rate_hz, self._publish_tick)
self.create_timer(1.0, self._write_status_tick)
recv_target = self._recv_loop_unix_dgram if self._transport_name == 'unix_dgram' else self._recv_loop recv_target = self._recv_loop_unix_dgram if self._transport_name == 'unix_dgram' else self._recv_loop
self._recv_thread = threading.Thread(target=recv_target, daemon=True) self._recv_thread = threading.Thread(target=recv_target, daemon=True)
@@ -174,6 +181,8 @@ class UdpCmdVelReceiver(Node):
pass pass
try: try:
self._transport = self._create_transport() self._transport = self._create_transport()
self._transport_reconnect_count += 1
self._set_runtime_last_error('')
if self._should_log('transport_reconnected', 1.0): if self._should_log('transport_reconnected', 1.0):
self.get_logger().info( self.get_logger().info(
'Reconnected OmniSocket transport %s://%s as %s' 'Reconnected OmniSocket transport %s://%s as %s'
@@ -182,6 +191,7 @@ class UdpCmdVelReceiver(Node):
return True return True
except OSError as exc: except OSError as exc:
self._transport = None self._transport = None
self._set_runtime_last_error(str(exc))
if self._should_log('transport_reconnect_error', 2.0): if self._should_log('transport_reconnect_error', 2.0):
self.get_logger().error(f'Failed to reconnect OmniSocket transport: {exc}') self.get_logger().error(f'Failed to reconnect OmniSocket transport: {exc}')
time.sleep(0.5) time.sleep(0.5)
@@ -192,10 +202,13 @@ class UdpCmdVelReceiver(Node):
self._close_unix_socket() self._close_unix_socket()
try: try:
self._setup_unix_socket() self._setup_unix_socket()
self._transport_reconnect_count += 1
self._set_runtime_last_error('')
if self._should_log('unix_rebound', 1.0): if self._should_log('unix_rebound', 1.0):
self.get_logger().info(f'Rebound unix datagram socket at {self._local_socket_path}') self.get_logger().info(f'Rebound unix datagram socket at {self._local_socket_path}')
return True return True
except OSError as exc: except OSError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('unix_rebind_error', 2.0): if self._should_log('unix_rebind_error', 2.0):
self.get_logger().error(f'Failed to rebind unix datagram socket: {exc}') self.get_logger().error(f'Failed to rebind unix datagram socket: {exc}')
time.sleep(0.5) time.sleep(0.5)
@@ -209,6 +222,61 @@ class UdpCmdVelReceiver(Node):
return True return True
return False return False
def _now_epoch_ms(self) -> int:
return time.time_ns() // 1_000_000
def _update_recv_heartbeat(self) -> None:
with self._lock:
self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms()
def _last_packet_age_ms(self) -> int | None:
with self._lock:
last_packet_monotonic = self._last_packet_monotonic
if last_packet_monotonic is None:
return None
return max(0, int((time.monotonic() - last_packet_monotonic) * 1000.0))
def _socket_bound(self) -> bool:
if self._transport_name == 'unix_dgram':
return self._unix_socket is not None and os.path.exists(self._local_socket_path)
return self._transport is not None
def _set_runtime_last_error(self, message: str) -> None:
self._runtime_last_error = message
def _status_payload(self) -> dict[str, object]:
with self._lock:
recv_thread_heartbeat_epoch_ms = self._recv_thread_heartbeat_epoch_ms
return {
'updated_at_epoch_ms': self._now_epoch_ms(),
'pid': os.getpid(),
'recv_thread_heartbeat_epoch_ms': recv_thread_heartbeat_epoch_ms,
'transport': self._transport_name,
'local_socket_path': self._local_socket_path,
'socket_bound': self._socket_bound(),
'transport_reconnect_count': self._transport_reconnect_count,
'last_packet_age_ms': self._last_packet_age_ms(),
'last_error': self._runtime_last_error,
}
def _write_status_tick(self) -> None:
payload = self._status_payload()
if self._transport_name == 'unix_dgram':
if self._unix_socket is None:
payload['last_error'] = self._runtime_last_error or 'unix datagram socket is not bound'
else:
if self._transport is None:
payload['last_error'] = self._runtime_last_error or 'OmniSocket transport is not connected'
try:
os.makedirs(self._runtime_dir, exist_ok=True)
temp_path = f'{self._status_path}.tmp.{os.getpid()}'
with open(temp_path, 'w', encoding='utf-8') as handle:
json.dump(payload, handle, ensure_ascii=True, separators=(',', ':'))
os.replace(temp_path, self._status_path)
except OSError as exc:
if self._should_log('status_write_error', 5.0):
self.get_logger().warning(f'Failed to write receiver status file: {exc}')
def _publish_command(self, command: CommandTuple) -> None: def _publish_command(self, command: CommandTuple) -> None:
msg = TwistStamped() msg = TwistStamped()
msg.header.stamp = self.get_clock().now().to_msg() msg.header.stamp = self.get_clock().now().to_msg()
@@ -229,32 +297,39 @@ class UdpCmdVelReceiver(Node):
def _recv_loop(self) -> None: def _recv_loop(self) -> None:
while not self._closing.is_set() and rclpy.ok(): while not self._closing.is_set() and rclpy.ok():
self._update_recv_heartbeat()
try: try:
assert self._transport is not None assert self._transport is not None
meta = self._transport.recv_into(buffer=self._recv_buffer, timeout_ms=100) meta = self._transport.recv_into(buffer=self._recv_buffer, timeout_ms=100)
except BufferError as exc: except BufferError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('buffer_error', 2.0): if self._should_log('buffer_error', 2.0):
self.get_logger().warning(f'Dropped oversized OmniSocket frame: {exc}') self.get_logger().warning(f'Dropped oversized OmniSocket frame: {exc}')
continue continue
except OSError as exc: except OSError as exc:
self._set_runtime_last_error(str(exc))
if not self._closing.is_set() and self._should_log('recv_error', 2.0): if not self._closing.is_set() and self._should_log('recv_error', 2.0):
self.get_logger().error(f'OmniSocket receive loop stopped: {exc}') self.get_logger().error(f'OmniSocket receive loop stopped: {exc}')
if not self._reconnect_transport(): if not self._reconnect_transport():
return return
continue continue
self._update_recv_heartbeat()
if meta is None: if meta is None:
continue continue
self._set_runtime_last_error('')
from_peer = str(meta['from']) from_peer = str(meta['from'])
msg_type = int(meta['msg_type']) msg_type = int(meta['msg_type'])
body_len = int(meta['body_len']) body_len = int(meta['body_len'])
if msg_type == self._msg_type_error: if msg_type == self._msg_type_error:
self._set_runtime_last_error(f'server error message from {from_peer}')
self._handle_error_message(from_peer, body_len) self._handle_error_message(from_peer, body_len)
continue continue
if self._expected_sender and from_peer != self._expected_sender: if self._expected_sender and from_peer != self._expected_sender:
self._set_runtime_last_error(f'unexpected sender {from_peer}')
if self._should_log('unexpected_sender', 2.0): if self._should_log('unexpected_sender', 2.0):
self.get_logger().warning( self.get_logger().warning(
'Ignoring message from unexpected sender %s (expected %s)' 'Ignoring message from unexpected sender %s (expected %s)'
@@ -263,6 +338,7 @@ class UdpCmdVelReceiver(Node):
continue continue
if msg_type != self._msg_type_binary: if msg_type != self._msg_type_binary:
self._set_runtime_last_error(f'unexpected message type {msg_type}')
if self._should_log('unexpected_type', 2.0): if self._should_log('unexpected_type', 2.0):
self.get_logger().warning( self.get_logger().warning(
'Ignoring unexpected message type %d from %s (%d bytes)' 'Ignoring unexpected message type %d from %s (%d bytes)'
@@ -271,6 +347,7 @@ class UdpCmdVelReceiver(Node):
continue continue
if body_len != PACKET_SIZE: if body_len != PACKET_SIZE:
self._set_runtime_last_error(f'invalid payload size {body_len}')
if self._should_log('packet_size', 2.0): if self._should_log('packet_size', 2.0):
self.get_logger().warning( self.get_logger().warning(
'Dropped binary payload from %s with invalid size %d (expected %d)' 'Dropped binary payload from %s with invalid size %d (expected %d)'
@@ -281,6 +358,7 @@ class UdpCmdVelReceiver(Node):
try: try:
command = unpack_command(self._recv_buffer[:PACKET_SIZE]) command = unpack_command(self._recv_buffer[:PACKET_SIZE])
except ValueError as exc: except ValueError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('decode_error', 2.0): if self._should_log('decode_error', 2.0):
self.get_logger().warning(f'Dropped malformed command payload: {exc}') self.get_logger().warning(f'Dropped malformed command payload: {exc}')
continue continue
@@ -288,15 +366,18 @@ class UdpCmdVelReceiver(Node):
with self._lock: with self._lock:
self._latest_command = command self._latest_command = command
self._last_packet_monotonic = time.monotonic() self._last_packet_monotonic = time.monotonic()
self._set_runtime_last_error('')
def _recv_loop_unix_dgram(self) -> None: def _recv_loop_unix_dgram(self) -> None:
assert self._unix_socket is not None assert self._unix_socket is not None
while not self._closing.is_set() and rclpy.ok(): while not self._closing.is_set() and rclpy.ok():
self._update_recv_heartbeat()
try: try:
payload = self._unix_socket.recv(DEFAULT_RECV_BUFFER_BYTES) payload = self._unix_socket.recv(DEFAULT_RECV_BUFFER_BYTES)
except socket.timeout: except socket.timeout:
if not os.path.exists(self._local_socket_path): if not os.path.exists(self._local_socket_path):
self._set_runtime_last_error('unix datagram socket path disappeared')
if self._should_log('unix_socket_missing', 2.0): if self._should_log('unix_socket_missing', 2.0):
self.get_logger().warning( self.get_logger().warning(
f'Unix datagram socket path disappeared, rebinding {self._local_socket_path}' f'Unix datagram socket path disappeared, rebinding {self._local_socket_path}'
@@ -305,13 +386,16 @@ class UdpCmdVelReceiver(Node):
return return
continue continue
except OSError as exc: except OSError as exc:
self._set_runtime_last_error(str(exc))
if not self._closing.is_set() and self._should_log('unix_recv_error', 2.0): if not self._closing.is_set() and self._should_log('unix_recv_error', 2.0):
self.get_logger().error(f'Unix datagram receive loop stopped: {exc}') self.get_logger().error(f'Unix datagram receive loop stopped: {exc}')
if not self._rebind_unix_socket(): if not self._rebind_unix_socket():
return return
continue continue
self._update_recv_heartbeat()
if len(payload) != PACKET_SIZE: if len(payload) != PACKET_SIZE:
self._set_runtime_last_error(f'invalid unix datagram payload size {len(payload)}')
if self._should_log('unix_packet_size', 2.0): if self._should_log('unix_packet_size', 2.0):
self.get_logger().warning( self.get_logger().warning(
'Dropped unix datagram payload with invalid size %d (expected %d)' 'Dropped unix datagram payload with invalid size %d (expected %d)'
@@ -322,6 +406,7 @@ class UdpCmdVelReceiver(Node):
try: try:
command = unpack_command(payload) command = unpack_command(payload)
except ValueError as exc: except ValueError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('unix_decode_error', 2.0): if self._should_log('unix_decode_error', 2.0):
self.get_logger().warning(f'Dropped malformed unix datagram payload: {exc}') self.get_logger().warning(f'Dropped malformed unix datagram payload: {exc}')
continue continue
@@ -329,6 +414,7 @@ class UdpCmdVelReceiver(Node):
with self._lock: with self._lock:
self._latest_command = command self._latest_command = command
self._last_packet_monotonic = time.monotonic() self._last_packet_monotonic = time.monotonic()
self._set_runtime_last_error('')
def _command_for_publish_tick(self) -> tuple[CommandTuple, Optional[float], bool]: def _command_for_publish_tick(self) -> tuple[CommandTuple, Optional[float], bool]:
with self._lock: with self._lock:

View File

@@ -1,385 +1,219 @@
# 机器人 B 端开机自启说明 # Robot B-Side Boot Chain
这个目录是给机器人端做开机自启用的。 This directory contains the robot-side boot and recovery scripts.
你看到这里多了不少脚本和 `systemd` 单元,不是为了让你手工一条条执行,而是为了把开机流程拆开管理: Normal usage is:
1. 固定启动顺序
2. 某一步失败时可单独重试
3. 所有动作统一写到一个本地日志文件
4. 后面如果要把“固定延时 30 秒”换成“等待机器人原有自检完成”,只改最前面的闸门即可
所以平时真正需要人工执行的,通常只有这两步:
```bash ```bash
sudo bash scripts/boot/install-systemd.sh sudo bash scripts/boot/install-systemd.sh
sudo systemctl start blitz-robot.target sudo systemctl start blitz-robot.target
``` ```
以后机器人重启时,就不需要你再手工执行这些脚本了。 After installation, `blitz-robot.target` is enabled and will start automatically on reboot.
## 启动顺序 To stop the chain now and disable boot-time autostart for future reboots:
当前开机链路如下: ```bash
sudo bash scripts/boot/disable-systemd.sh
```
## Current Startup Order
The current cold-start chain is:
1. `blitz-boot-gate.service` 1. `blitz-boot-gate.service`
2. `blitz-5g-dial.service` 2. `blitz-5g-dial.service`
3. `blitz-time-sync.service` 3. `blitz-ros-receiver.service`
4. `blitz-ros-receiver.service` 4. `blitz-b-side-omnid.service`
5. `blitz-b-side-omnid.service` 5. `blitz-watchdog.service`
对应业务顺序就是: There is no longer any automatic time-sync step in the boot chain.
1. 先固定等待 30 秒,给机器人原有自检/自启程序让路 ## What Each Script Does
2. 运行 5G 自动拨号
3. 运行时钟同步
4. 启动 `start-ros-receiver.sh`
5. 启动 `start-b-side-omnid.sh`
## 日志文件 - `robot-boot.env`: default boot configuration
- `robot-boot.env.local`: machine-local overrides
- `common.sh`: shared env loading, logging, and helper functions
- `boot-gate.sh`: fixed startup delay gate
- `5g-dial.sh`: brings up the 5G modem path and verifies routing
- `start-ros-receiver-service.sh`: boot wrapper for ROS receiver
- `wait-for-unix-socket.sh`: waits for the ROS receiver unix socket
- `start-b-side-omnid-service.sh`: boot wrapper for `b_side_omnid`
- `blitz-watchdog.sh`: runtime health watchdog and recovery orchestrator
- `blitz-fault-inject.sh`: fault injection entrypoint
- `install-systemd.sh`: installs systemd units into `/etc/systemd/system`
- `disable-systemd.sh`: stops the boot chain and disables autostart
所有关键操作都会统一写到这个本地文件: ## Important Configuration
```text Most machine-specific overrides should go into:
/var/log/blitz-robot/startup.log
```
每一行日志格式如下:
```text
timestamp | step | action | result | details | exit_code
```
日志里会记录:
- 做了什么
- 实际执行了什么命令
- 前置检查是否通过
- 成功还是失败
- 失败原因
- 退出码
- 是否发生了重试
## 这些文件分别是干什么的
- `robot-boot.env`:开机自启默认配置
- `robot-boot.env.local`:本机覆盖配置,建议把你自己的配置写这里
- `common.sh`:公共环境加载和统一日志函数
- `boot-gate.sh`:启动闸门,当前逻辑是固定等待 30 秒
- `5g-dial.sh`:等待 5G 串口出现,执行 `rndis_dial.py`,删除 5G 默认路由并补齐目标主机路由,然后检查路由是否真的起来
- `time-sync.sh`:把 `chrony` 指向白名单服务器 IP 和端口,并执行一次同步
- `start-ros-receiver-service.sh`:开机版 ROS receiver 启动包装
- `wait-for-unix-socket.sh`:等待 ROS receiver 建好本地 unix socket
- `start-b-side-omnid-service.sh`:开机版 `b_side_omnid` 启动包装
- `install-systemd.sh`:把 `systemd` 单元安装到 `/etc/systemd/system`
- `systemd/*.service.in``systemd/*.target.in``systemd` 模板文件
## 前置条件
你前面说过,除了时钟同步以外,其他程序环境都应该已经配好了。按这个前提,这里只强调必须确认的前置条件。
### 1. 机器人侧必须已有的条件
默认认为下面这些已经具备:
- 系统是 Ubuntu且使用 `systemd`
- `OmniSocketGo` 仓库已经放在机器人上
- `scripts/dev/start-ros-receiver.sh` 原本就能正常启动
- `scripts/dev/start-b-side-omnid.sh` 原本就能正常启动
- `bin/b_side_omnid` 已经提前编译好
- 5G 拨号脚本存在:`/home/nvidia/5g-test/5G/rndis_dial.py`
- 5G 串口设备是:`/dev/ttyUSB7`
注意:
- 开机模式下不会自动编译 `b_side_omnid`
- 如果 `bin/b_side_omnid` 不存在,服务会直接报错并写日志
### 2. 时钟同步需要的前置安装
时钟同步这一步依赖 `chrony`
如果机器人侧没有安装,请先安装:
```bash
sudo apt update
sudo apt install -y chrony
```
安装后建议确认:
```bash
systemctl status chrony
chronyc tracking
```
### 3. 云服务器侧需要的前置条件
因为你的 5G 是白名单网络,所以时钟同步不能依赖公网域名或默认 NTP 池,必须只用你的白名单云服务器 IP。
云服务器侧需要满足:
- 服务器上运行 `chronyd`
- 安全组 / 防火墙放通你实际使用的 UDP 端口
- 机器人能访问这台服务器的 IP
如果云服务器还没有安装 `chrony`,可以参考:
```bash
sudo apt update
sudo apt install -y chrony
sudo systemctl enable chrony
sudo systemctl restart chrony
```
如果你不能使用标准的 `123/udp`,完全可以改成你自己的端口,例如 `10910/udp`
例如云服务器 /etc/chrony/chrony.conf 里改成监听 10910
```conf
port 10910
allow 0/0
```
然后重启:
```bash
sudo systemctl restart chrony
```
机器人端则在 `robot-boot.env.local` 里配置:
```bash
BLITZ_TIME_SERVER_IP="你的云服务器IP"
BLITZ_TIME_SERVER_PORT="10910"
```
这样 `time-sync.sh` 会自动生成:
```conf
server 你的云服务器IP port 10910 iburst
```
注意:这里必须是你自己可控的 `chronyd` 服务端。公网标准 NTP 服务通常只监听 `123/udp`,不能要求它们改到 `10910`
## 需要改哪些配置
不要直接改 `robot-boot.env`,更推荐新建:
```text ```text
scripts/boot/robot-boot.env.local scripts/boot/robot-boot.env.local
``` ```
常见要改的是这些: Typical settings:
```bash ```bash
BLITZ_BOOT_DELAY_SEC="30" BLITZ_BOOT_DELAY_SEC="30"
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log" BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
BLITZ_RUNTIME_DIR="/run/blitz-robot"
BLITZ_5G_DIAL_DIR="/home/nvidia/5g-test/5G" BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot"
BLITZ_5G_SERIAL_PORT="/dev/ttyUSB7" BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2"
BLITZ_5G_INTERFACE=""
BLITZ_5G_MODEM_SUBNET="192.168.224.0/22"
BLITZ_5G_GATEWAY="192.168.225.1" BLITZ_5G_GATEWAY="192.168.225.1"
BLITZ_5G_REMOVE_DEFAULT_ROUTE="1" BLITZ_5G_REMOVE_DEFAULT_ROUTE="1"
BLITZ_5G_ROUTE_TARGETS="106.55.173.235" BLITZ_5G_ROUTE_TARGETS="106.55.173.235"
BLITZ_5G_INFO_JSON="${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json"
BLITZ_TIME_SERVER_IP="你的白名单云服务器IP" BLITZ_TIME_SERVER_IP="81.70.156.140"
BLITZ_TIME_SERVER_PORT="10910"
BLITZ_ROS_USER="nvidia" BLITZ_ROS_USER="nvidia"
BLITZ_ROS_SOCKET_WAIT_SEC="20"
BLITZ_WATCHDOG_INTERVAL_SEC="5"
BLITZ_HEALTH_STALE_SEC="15"
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
BLITZ_NETWORK_FAIL_THRESHOLD="3"
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
BLITZ_GPS_MONITOR_ENABLED="1"
BLITZ_GPS_DEVICE_GLOB="/dev/ttyCH341USB*"
BLITZ_GPS_CHECK_INTERVAL_SEC="10"
BLITZ_GPS_RESTART_UNITS="gpsd.socket gpsd.service"
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
``` ```
如果 `BLITZ_TIME_SERVER_IP` 留空,脚本会自动回退到 `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR` 的 IP 部分。 `BLITZ_TIME_SERVER_IP` is still used, but only as the 5G route/ping health-check target. It is no longer used for automatic clock synchronization.
`BLITZ_5G_REMOVE_DEFAULT_ROUTE="1"` 时,脚本会在 5G 拨号完成后删除该接口上的默认路由,避免整机默认出口切到 5G。此时 `BLITZ_TIME_SERVER_IP``BLITZ_5G_ROUTE_TARGETS` 中的目标 IP 会显式走 5G其它流量继续走有线或 Wi-Fi 的默认路由。 If `BLITZ_TIME_SERVER_IP` is left empty, the scripts fall back to the host part of `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR`.
## 如何安装和使用 ## Install Or Upgrade
下面假设你当前目录就在 `OmniSocketGo` 仓库根目录。 Run:
### 第一步:准备本机配置
建议先创建:
```bash
cp scripts/boot/robot-boot.env scripts/boot/robot-boot.env.local
```
然后编辑:
```bash
vim scripts/boot/robot-boot.env.local
```
至少确认这几个值是对的:
- `BLITZ_5G_DIAL_DIR`
- `BLITZ_5G_SERIAL_PORT`
- `BLITZ_TIME_SERVER_IP`
- `BLITZ_TIME_SERVER_PORT`
- `BLITZ_ROS_USER`
### 第二步:安装 systemd 单元
执行:
```bash ```bash
sudo bash scripts/boot/install-systemd.sh sudo bash scripts/boot/install-systemd.sh
sudo systemctl daemon-reload
sudo systemctl restart blitz-robot.target
``` ```
这个安装脚本会做这些事情: `install-systemd.sh` will also remove any old `blitz-time-sync.service` unit left over from earlier versions.
1. 创建日志目录和日志文件 ## Disable Autostart
2. 渲染 `systemd` 模板
3. 把 unit 文件复制到 `/etc/systemd/system`
4. 执行 `systemctl daemon-reload`
5. 执行 `systemctl enable blitz-robot.target`
### 第三步:立刻启动一次 To stop the currently running services and disable autostart for future reboots:
执行:
```bash ```bash
sudo bash scripts/boot/disable-systemd.sh
```
To re-enable later:
```bash
sudo bash scripts/boot/install-systemd.sh
sudo systemctl start blitz-robot.target sudo systemctl start blitz-robot.target
``` ```
### 第四步:以后重启自动生效 ## Logs
因为安装脚本已经做了 `enable`,所以后续机器人重启时会自动拉起,不需要你再手工执行。 All boot-chain and watchdog logs are appended to:
如果想手工确认,也可以执行:
```bash
sudo systemctl enable blitz-robot.target
```
## 如何查看是否正常
### 看总日志文件
最直接:
```bash
tail -f /var/log/blitz-robot/startup.log
```
### 看各个服务状态
```bash
systemctl status blitz-robot.target
systemctl status blitz-boot-gate.service
systemctl status blitz-5g-dial.service
systemctl status blitz-time-sync.service
systemctl status blitz-ros-receiver.service
systemctl status blitz-b-side-omnid.service
```
### 看 journal
```bash
journalctl -u blitz-robot.target -u blitz-boot-gate.service -u blitz-5g-dial.service \
-u blitz-time-sync.service -u blitz-ros-receiver.service \
-u blitz-b-side-omnid.service -f
```
## 当前时钟同步会做什么
`time-sync.sh` 当前逻辑是:
1. 读取 `BLITZ_TIME_SERVER_IP`
2. 读取 `BLITZ_TIME_SERVER_PORT`
3. 修改 `/etc/chrony/chrony.conf`
4. 注释掉原有的 `pool``server`
5. 保留一个备份文件:`/etc/chrony/chrony.conf.blitz-bak`
6. 写入:
```text ```text
/etc/chrony/sources.d/blitz-robot.sources /var/log/blitz-robot/startup.log
``` ```
7. 生成类似下面这一行: Follow the log live:
```conf
server 你的云服务器IP port 10910 iburst
```
8. 重启 `chrony`
9. 执行 `chronyc burst`
10. 执行 `chronyc waitsync`
注意:
- 如果同步超时,会记日志为 `soft_fail`
- 但不会阻塞后面的 ROS 和 `b_side_omnid` 启动
## 常见问题
### 1. 为什么会突然多出这么多脚本?
因为把开机流程拆成了多个稳定的小步骤:
- 更容易排查哪一步失败
- 更容易让 `systemd` 自动重启
- 更容易记录完整日志
- 后续更容易替换“30 秒延时”为真正的机器人 ready 条件
你平时不需要手工逐个执行这些脚本。
### 2. 我是不是要手工跑 `5g-dial.sh`、`time-sync.sh`、`start-ros-receiver-service.sh`
正常情况下不用。
你只需要:
```bash ```bash
sudo bash scripts/boot/install-systemd.sh sudo tail -f /var/log/blitz-robot/startup.log
sudo systemctl start blitz-robot.target
``` ```
### 3. 如果时钟同步失败怎么办? Check service state:
先看:
```bash ```bash
tail -f /var/log/blitz-robot/startup.log sudo systemctl status blitz-robot.target
systemctl status blitz-time-sync.service sudo systemctl status blitz-5g-dial.service
chronyc sources -v sudo systemctl status blitz-ros-receiver.service
chronyc tracking sudo systemctl status blitz-b-side-omnid.service
sudo systemctl status blitz-watchdog.service
``` ```
优先检查: Check systemd journal:
- `BLITZ_TIME_SERVER_IP` 是否填对
- `BLITZ_TIME_SERVER_PORT` 是否填对
- 云服务器是否真的跑了 `chronyd`
- 云服务器防火墙 / 安全组是否放通你配置的 UDP 端口,例如 `10910`
- 5G 白名单是否确实允许访问这个服务器 IP
### 4. 如果 ROS receiver 没起来怎么办?
先看:
```bash ```bash
systemctl status blitz-ros-receiver.service sudo journalctl -u blitz-robot.target -u blitz-5g-dial.service \
tail -f /var/log/blitz-robot/startup.log -u blitz-ros-receiver.service -u blitz-b-side-omnid.service \
-u blitz-watchdog.service -f
``` ```
再检查: ## Runtime Status Files
- `/opt/ros/${ROS_DISTRO}/setup.bash` 是否存在 The runtime status directory is:
- `${ROS_CONTROL_PY_DIR}/install/setup.bash` 是否存在
- `ROBOT_RECEIVER_LOCAL_SOCKET_PATH` 对应的 socket 是否出现
### 5. 如果 b_side_omnid 没起来怎么办? ```text
/run/blitz-robot
```
先看: Key files:
- `b-side-omnid.status.json`
- `ros-receiver.status.json`
- `watchdog.status.json`
`watchdog.status.json` now also records `gps_ok` and `gps_device_present` so you can quickly tell whether the GPS USB serial node is currently visible and whether the last `gpsd` reconnect attempt succeeded.
Pretty-print them:
```bash ```bash
systemctl status blitz-b-side-omnid.service sudo python3 -m json.tool /run/blitz-robot/watchdog.status.json
tail -f /var/log/blitz-robot/startup.log sudo python3 -m json.tool /run/blitz-robot/b-side-omnid.status.json
sudo python3 -m json.tool /run/blitz-robot/ros-receiver.status.json
``` ```
再检查: ## Fault Injection
- `bin/b_side_omnid` 是否已经提前编译好 Available test commands:
- 摄像头设备是否存在
- `robot-remote.env` / `robot-boot.env.local` 里的地址配置是否正确 ```bash
sudo bash scripts/boot/blitz-fault-inject.sh bside-crash
sudo bash scripts/boot/blitz-fault-inject.sh bside-process-freeze
sudo bash scripts/boot/blitz-fault-inject.sh bside-video-thread-stall
sudo bash scripts/boot/blitz-fault-inject.sh bside-control-thread-stall
sudo bash scripts/boot/blitz-fault-inject.sh ros-crash
sudo bash scripts/boot/blitz-fault-inject.sh ros-freeze
```
For synthetic network fault injection, first enable it in `robot-boot.env.local`:
```bash
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="1"
```
Then restart watchdog and inject:
```bash
sudo systemctl restart blitz-watchdog.service
sudo bash scripts/boot/blitz-fault-inject.sh network-down on
sudo bash scripts/boot/blitz-fault-inject.sh network-down off
```
## Recovery Behavior Summary
- If `b_side_omnid` dies or its status file goes stale, watchdog first tries a targeted `b_side` restart.
- If ROS receiver dies, loses its socket, or its heartbeat goes stale, watchdog performs an ordered full restart:
- stop `b_side`
- restart ROS receiver
- wait for unix socket
- start `b_side`
- If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services.
- While 5G is healthy, watchdog keeps every host route listed by `BLITZ_TIME_SERVER_IP` and `BLITZ_5G_ROUTE_TARGETS` pinned to the resolved 5G interface. When 5G becomes unhealthy, watchdog deletes those host routes so traffic can fall back to the remaining default network path. If that fallback path is still reachable, watchdog keeps `b_side_omnid` running instead of treating it as a full network outage.
- Whenever watchdog changes or restores those host routes, it logs `route-path` lines for each target so you can see which interface Linux currently chooses for `81.70.156.140`, `106.55.173.235`, and any other configured 5G-pinned target.
- If GPS monitoring is enabled, watchdog checks `BLITZ_GPS_DEVICE_GLOB` every `BLITZ_GPS_CHECK_INTERVAL_SEC` seconds. When the GPS serial device disappears and later reappears, watchdog restarts the units in `BLITZ_GPS_RESTART_UNITS` so `gpsd` can bind to the new device node again.
- Camera disappearance is logged as a degraded state. Reappearance triggers a `b_side` restart once the device is stable.
## Notes
- `time-sync.sh` and `blitz-time-sync.service` are intentionally removed from the automatic boot path.
- `b_side_omnid` must already be built before boot-time startup.
- A missing `bin/b_side_omnid` binary, a missing ROS environment, or a missing modem script is reported in `startup.log`.

View File

@@ -0,0 +1,137 @@
#!/usr/bin/env bash
# 5G link quality logger: samples interface counters, routing and ping
# reachability in a loop and appends one JSON line per sample to a JSONL file.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, used to locate the shared helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument to blitz_log calls in this script.
STEP="5g-link-logger"
# Print the address used as the route/ping probe target.
#
# Preference order:
#   1. BLITZ_TIME_SERVER_IP, when non-empty.
#   2. The first entry of BLITZ_5G_ROUTE_TARGETS (comma- or whitespace-separated).
#
# Outputs: the chosen address on stdout, followed by a newline.
# Returns: 0 when an address was printed, 1 when neither variable yields one.
resolve_target_ip() {
  # Both variables are read with an empty default so the function is safe under
  # `set -u` even when the boot environment has not been loaded.
  local route_targets="${BLITZ_5G_ROUTE_TARGETS:-}"
  # Keep the loop variable local so it does not leak into the caller's scope.
  local candidate

  if [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]]; then
    printf '%s\n' "${BLITZ_TIME_SERVER_IP}"
    return 0
  fi

  # Commas are turned into spaces and the result is word-split on purpose.
  for candidate in ${route_targets//,/ }; do
    if [[ -n "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done

  return 1
}
# Print one compact JSON object describing the current 5G link state.
#
# Arguments:
#   $1  interface name (may be empty; link counters are then skipped and the
#       route check accepts any outgoing device)
#   $2  probe target address (may be empty; route and ping checks are skipped)
#
# Output: a single JSON line on stdout with the timestamp, link counters read
# from `ip -j -s link show`, the first line of `ip route get`, and the result of
# a single ping. All probes are best-effort: a failing probe leaves its fields
# at their defaults instead of aborting the sample.
emit_sample_json() {
  local interface_name="${1:-}"
  local target_ip="${2:-}"

  python3 - "${interface_name}" "${target_ip}" <<'PY'
import json
import subprocess
import sys
import time

interface_name = sys.argv[1]
target_ip = sys.argv[2]

# Defaults describe "nothing observed"; probes below overwrite what they learn.
payload = {
    "ts_unix_ms": time.time_ns() // 1_000_000,
    "interface": interface_name,
    "target_ip": target_ip,
    "link_present": False,
    "route_output": "",
    "route_ok": False,
    "probe_ok": False,
    "ping_rtt_ms": None,
    "rx_bytes": 0,
    "tx_bytes": 0,
    "rx_packets": 0,
    "tx_packets": 0,
    "rx_errors": 0,
    "tx_errors": 0,
    "rx_drops": 0,
    "tx_drops": 0,
}

if interface_name:
    try:
        output = subprocess.check_output(
            ["ip", "-j", "-s", "link", "show", "dev", interface_name],
            text=True,
            stderr=subprocess.DEVNULL,
        )
        stats = json.loads(output)
        if stats:
            item = stats[0]
            payload["link_present"] = True
            rx = item.get("stats64", {}).get("rx", {})
            tx = item.get("stats64", {}).get("tx", {})
            # Fall back to the "stats" key when "stats64" carries no counters.
            if not rx and not tx:
                rx = item.get("stats", {}).get("rx", {})
                tx = item.get("stats", {}).get("tx", {})
            payload["rx_bytes"] = int(rx.get("bytes") or 0)
            payload["tx_bytes"] = int(tx.get("bytes") or 0)
            payload["rx_packets"] = int(rx.get("packets") or 0)
            payload["tx_packets"] = int(tx.get("packets") or 0)
            payload["rx_errors"] = int(rx.get("errors") or 0)
            payload["tx_errors"] = int(tx.get("errors") or 0)
            payload["rx_drops"] = int(rx.get("dropped") or 0)
            payload["tx_drops"] = int(tx.get("dropped") or 0)
    except Exception:
        # Missing interface or unparsable output: keep the defaults.
        pass

if target_ip:
    try:
        route = subprocess.check_output(
            ["ip", "route", "get", target_ip],
            text=True,
            stderr=subprocess.STDOUT,
        ).strip()
        payload["route_output"] = route.splitlines()[0] if route else ""
        # With a known interface the route must leave through it.
        payload["route_ok"] = bool(payload["route_output"]) and (
            not interface_name or f" dev {interface_name}" in payload["route_output"]
        )
    except Exception as exc:
        payload["route_output"] = str(exc)

    ping_cmd = ["ping", "-c", "1", "-W", "2", target_ip]
    if interface_name:
        ping_cmd[1:1] = ["-I", interface_name]
    try:
        ping = subprocess.run(ping_cmd, capture_output=True, text=True)
    except OSError:
        # `ping` cannot be executed (e.g. not installed): report a failed probe
        # instead of crashing, consistent with the other best-effort probes.
        ping = None
    if ping is not None:
        payload["probe_ok"] = ping.returncode == 0
        output = (ping.stdout or "") + "\n" + (ping.stderr or "")
        for token in output.replace("\n", " ").split():
            if token.startswith("time="):
                value = token.split("=", 1)[1].rstrip("ms")
                try:
                    payload["ping_rtt_ms"] = float(value)
                except ValueError:
                    pass
                break

print(json.dumps(payload, separators=(",", ":"), ensure_ascii=False))
PY
}
# In boot mode, load the boot environment and insist on an existing run context.
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
  blitz_load_boot_env
  blitz_require_run_context
fi
# If no run directory is set yet and a run-context file exists, try to load it;
# a failure to load is tolerated.
if [[ -z "${BLITZ_RUN_DIR:-}" && -f "${BLITZ_RUN_CONTEXT_FILE:-}" ]]; then
  blitz_load_run_context_env || true
fi
blitz_ensure_instance_id
# Default log path lives in the run directory and is keyed by instance id.
# NOTE(review): BLITZ_RUN_DIR and BLITZ_INSTANCE_ID are expanded without
# defaults here, so under `set -u` they must be set by the helpers above — confirm.
export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}"
# The probe target is resolved once, before the loop; an empty result is accepted.
target_ip="$(resolve_target_ip || true)"
blitz_log "${STEP}" "start" "start" "path=${BLITZ_5G_LINK_LOG_PATH} interval_sec=${BLITZ_5G_LINK_LOG_INTERVAL_SEC}" 0
# Sample forever: re-resolve the 5G interface each round, emit one JSON line,
# append it to the log, then sleep for the configured interval.
while true; do
  interface_name="$(blitz_resolve_5g_interface || true)"
  line="$(emit_sample_json "${interface_name}" "${target_ip}")"
  blitz_jsonl_append_line "${BLITZ_5G_LINK_LOG_PATH}" "${line}"
  sleep "${BLITZ_5G_LINK_LOG_INTERVAL_SEC}"
done

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env bash
# Fault injection entry point: kills or freezes the b_side / ROS receiver
# service processes, or toggles fault flag files in the runtime directory.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, used to locate the shared helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument to blitz_log calls in this script.
STEP="fault-inject"
# systemd units whose main processes are targeted by the crash/freeze actions.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"
# Print the MainPID property systemd reports for the unit given as $1.
# Callers in this script treat an empty value or "0" as "not running".
main_pid_for_service() {
  local unit="$1"

  systemctl show --property=MainPID --value "${unit}"
}
# Poll once per second until the unit's MainPID is a live PID different from
# the previous one.
#
# Arguments:
#   $1  systemd unit name
#   $2  PID the service had before the fault was injected
#   $3  number of polls / seconds to wait (default 10)
#
# Outputs the new PID on stdout and returns 0 on success; returns 1 when no
# new PID was observed within the allowed number of polls.
wait_for_service_pid_change() {
  local unit="$1"
  local old_pid="$2"
  local max_polls="${3:-10}"
  local poll
  local observed_pid=""

  for (( poll = 0; poll < max_polls; poll++ )); do
    observed_pid="$(main_pid_for_service "${unit}")"
    # Skip empty, "0" (no main process) and the unchanged old PID.
    case "${observed_pid}" in
      "" | "0" | "${old_pid}")
        ;;
      *)
        printf '%s\n' "${observed_pid}"
        return 0
        ;;
    esac
    sleep 1
  done

  return 1
}
# Print the MainPID of the unit given as $1, or log a lookup failure and exit 1
# when systemd reports no main process (empty value or "0").
require_running_pid() {
  local unit="$1"
  local found_pid

  found_pid="$(main_pid_for_service "${unit}")"
  case "${found_pid}" in
    "" | "0")
      blitz_log "${STEP}" "lookup-pid" "failure" "service=${unit}" 1
      exit 1
      ;;
  esac

  printf '%s\n' "${found_pid}"
}
# Create (or overwrite) the fault flag file named $1 inside BLITZ_RUNTIME_DIR.
# The file content is the current epoch time in seconds; the action is logged.
write_fault_flag() {
  local target="${BLITZ_RUNTIME_DIR}/$1"
  local stamp

  stamp="$(date +%s)"
  printf '%s\n' "${stamp}" > "${target}"
  blitz_log "${STEP}" "flag-on" "success" "path=${target}" 0
}
# Remove the fault flag file named $1 from BLITZ_RUNTIME_DIR (a missing file is
# not an error because of `rm -f`) and log the action.
clear_fault_flag() {
  local target="${BLITZ_RUNTIME_DIR}/$1"

  rm -f "${target}"
  blitz_log "${STEP}" "flag-off" "success" "path=${target}" 0
}
# SIGKILL the main process of a service and wait for a replacement PID.
#   $1  action label used in log lines (e.g. "bside-crash")
#   $2  systemd unit name
# Exits 1 when no new PID is observed by wait_for_service_pid_change.
fault_inject_crash_service() {
  local action="$1"
  local unit="$2"
  local old_pid
  local new_pid

  old_pid="$(require_running_pid "${unit}")"
  blitz_log "${STEP}" "${action}" "start" "service=${unit} pid=${old_pid}" 0
  kill -9 "${old_pid}"
  if new_pid="$(wait_for_service_pid_change "${unit}" "${old_pid}")"; then
    blitz_log "${STEP}" "${action}" "success" "old_pid=${old_pid} new_pid=${new_pid}" 0
  else
    blitz_log "${STEP}" "${action}" "failure" "old_pid=${old_pid} restart_not_observed_within=10s" 1
    exit 1
  fi
}

# SIGSTOP the main process of a service so it stays alive but makes no progress.
#   $1  action label used in log lines (e.g. "ros-freeze")
#   $2  systemd unit name
fault_inject_freeze_service() {
  local action="$1"
  local unit="$2"
  local frozen_pid

  frozen_pid="$(require_running_pid "${unit}")"
  blitz_log "${STEP}" "${action}" "start" "service=${unit} pid=${frozen_pid}" 0
  kill -STOP "${frozen_pid}"
  blitz_log "${STEP}" "${action}" "success" "service=${unit} pid=${frozen_pid}" 0
}

# Print the command overview shown for unknown or missing sub-commands.
fault_inject_usage() {
  cat <<'EOF'
usage:
blitz-fault-inject.sh bside-crash
blitz-fault-inject.sh bside-process-freeze
blitz-fault-inject.sh bside-video-thread-stall
blitz-fault-inject.sh bside-control-thread-stall
blitz-fault-inject.sh ros-crash
blitz-fault-inject.sh ros-freeze
blitz-fault-inject.sh network-down on|off
EOF
}

# Load configuration, require root and make sure the runtime directory exists
# before any fault is injected.
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_prepare_runtime_dir

# Dispatch on the sub-command given as the first argument.
case "${1:-}" in
  bside-crash)
    fault_inject_crash_service "bside-crash" "${B_SIDE_SERVICE}"
    ;;
  bside-process-freeze)
    fault_inject_freeze_service "bside-process-freeze" "${B_SIDE_SERVICE}"
    ;;
  bside-video-thread-stall)
    write_fault_flag "fault-injection-bside-video-thread-stall"
    ;;
  bside-control-thread-stall)
    write_fault_flag "fault-injection-bside-control-thread-stall"
    ;;
  ros-crash)
    fault_inject_crash_service "ros-crash" "${ROS_SERVICE}"
    ;;
  ros-freeze)
    fault_inject_freeze_service "ros-freeze" "${ROS_SERVICE}"
    ;;
  network-down)
    # Synthetic network faults are only allowed when explicitly enabled.
    if [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" != "1" ]]; then
      blitz_log "${STEP}" "network-down" "failure" "set BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION=1 first" 1
      exit 1
    fi
    case "${2:-}" in
      on)
        write_fault_flag "fault-injection-network-down"
        ;;
      off)
        clear_fault_flag "fault-injection-network-down"
        ;;
      *)
        echo "usage: $0 network-down on|off" >&2
        exit 2
        ;;
    esac
    ;;
  *)
    fault_inject_usage
    exit 2
    ;;
esac

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Launch blitz-incident-capture.sh as a transient systemd unit.
#
# Usage: blitz-incident-launch.sh [--incident-id ID] [capture arguments...]
#
# --incident-id is consumed here (an id is generated when it is absent or
# empty); every other argument is forwarded unchanged to the capture script.
# The capture runs under `timeout` with BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC and its
# stdout/stderr are appended to BLITZ_LOG_FILE. The incident id is printed on
# stdout once systemd-run returns.
# NOTE(review): the unit is Type=oneshot and systemd-run is called without
# --no-block, so this script presumably waits for the capture to finish —
# confirm that this is intended.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"

STEP="incident-launch"
incident_id=""
args=()
timeout_bin=""

while (($# > 0)); do
  case "$1" in
    --incident-id)
      incident_id="${2:-}"
      # A trailing --incident-id without a value must not abort the script:
      # `shift 2` fails when only one argument is left, and `set -e` would then
      # exit silently. Shift only what is actually there.
      if (($# >= 2)); then
        shift 2
      else
        shift
      fi
      ;;
    *)
      args+=("$1")
      shift
      ;;
  esac
done

blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemd-run "${STEP}"
blitz_require_command timeout "${STEP}"
# Use the absolute path of `timeout` for the transient unit's command line.
timeout_bin="$(command -v timeout)"

if [[ -z "${incident_id}" ]]; then
  incident_id="$(blitz_new_incident_id)"
fi

# Replace every character outside [A-Za-z0-9_.-] so the id is usable in a unit name.
unit_name="blitz-incident-${incident_id//[^A-Za-z0-9_.-]/-}"

# ${args[@]+...} expands to nothing for an empty array, which keeps `set -u`
# from rejecting the expansion on Bash versions older than 4.4.
systemd-run \
  --quiet \
  --collect \
  --unit "${unit_name}" \
  --property=Type=oneshot \
  --property="StandardOutput=append:${BLITZ_LOG_FILE}" \
  --property="StandardError=append:${BLITZ_LOG_FILE}" \
  "${timeout_bin}" "${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC}s" \
  /bin/bash "${SCRIPT_DIR}/blitz-incident-capture.sh" \
  --incident-id "${incident_id}" \
  ${args[@]+"${args[@]}"}

printf '%s\n' "${incident_id}"

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env bash
# Incident capture: collects status files, log tails and command output into a
# per-incident directory.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, used to locate the shared helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument to blitz_log calls in this script.
STEP="incident-capture"
# Values filled from the command line (--incident-id, --source, --reason,
# --unit, --result, --exit-status); all default to empty.
incident_id=""
incident_source=""
incident_reason=""
incident_unit=""
incident_result=""
incident_exit_status=""
# Run a command and store its combined stdout/stderr in a file.
#
# Arguments:
#   $1    destination file
#   $2..  command and its arguments
#
# When `timeout` is available the command is limited to
# BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC seconds. Failures of the command are
# ignored on purpose so one broken collector does not stop the capture.
run_capture() {
  local destination="$1"
  shift

  local -a limiter=()
  if command -v timeout >/dev/null 2>&1; then
    limiter=(timeout "${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC}s")
  fi

  # ${limiter[@]+...} expands to nothing when no limiter is configured.
  ${limiter[@]+"${limiter[@]}"} "$@" > "${destination}" 2>&1 || true
}
# Parse "--flag value" pairs into the incident_* variables. Any unrecognised
# argument is logged and terminates the script with exit code 2.
# NOTE(review): `shift 2` fails when a flag is the last argument without a
# value, which aborts the script under `set -e` — confirm callers always pass one.
while (($# > 0)); do
  case "$1" in
    --incident-id)
      incident_id="${2:-}"
      shift 2
      ;;
    --source)
      incident_source="${2:-}"
      shift 2
      ;;
    --reason)
      incident_reason="${2:-}"
      shift 2
      ;;
    --unit)
      incident_unit="${2:-}"
      shift 2
      ;;
    --result)
      incident_result="${2:-}"
      shift 2
      ;;
    --exit-status)
      incident_exit_status="${2:-}"
      shift 2
      ;;
    *)
      blitz_log "${STEP}" "parse-arg" "failure" "unknown argument: $1" 2
      exit 2
      ;;
  esac
done
# Nothing to capture when the reported service result is "success".
if [[ -n "${incident_result}" && "${incident_result}" == "success" ]]; then
  exit 0
fi
# Load configuration and make sure the runtime and run-root directories exist.
# The run context is optional: a failure to load it is tolerated.
blitz_load_boot_env
blitz_load_run_context_env || true
blitz_prepare_runtime_dir
blitz_prepare_run_root

if [[ -z "${incident_id}" ]]; then
  incident_id="$(blitz_new_incident_id)"
fi

# Everything captured for this incident goes into its own directory.
incident_dir="${BLITZ_RUN_ROOT}/incidents/${incident_id}"
mkdir -p "${incident_dir}"

# Write the incident metadata file. Ten positional values are passed after "-".
python3 - "${incident_dir}/incident.json" "${incident_id}" "${BLITZ_RUN_ID:-}" "${incident_source}" "${incident_reason}" "${incident_unit}" "${incident_result}" "${incident_exit_status}" "${BLITZ_RUN_DIR:-}" "${HOSTNAME:-$(hostname)}" <<'PY'
import json
import sys
import time

# Ten values follow the script name, so the slice must end at index 11.
# (A slice ending at 10 yields only nine values and the unpacking raises
# ValueError, which aborts the whole capture.)
(
    path,
    incident_id,
    run_id,
    source,
    reason,
    unit,
    result,
    exit_status,
    run_dir,
    hostname,
) = sys.argv[1:11]

payload = {
    "incident_id": incident_id,
    "run_id": run_id,
    "source": source,
    "fault_reason": reason,
    "unit": unit,
    "service_result": result,
    "exit_status": exit_status,
    "run_dir": run_dir,
    "hostname": hostname,
    "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
}
with open(path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
PY

# Snapshot the runtime status files that currently exist.
for status_file in \
  "${BLITZ_RUNTIME_DIR}/watchdog.status.json" \
  "${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" \
  "${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
do
  if [[ -f "${status_file}" ]]; then
    cp -f "${status_file}" "${incident_dir}/$(basename "${status_file}")"
  fi
done

# Keep the last 400 lines of the shared log file.
if [[ -f "${BLITZ_LOG_FILE}" ]]; then
  tail -n 400 "${BLITZ_LOG_FILE}" > "${incident_dir}/startup.log.tail"
fi

# Service state, recent journal and network/socket state; each collector is
# best-effort (see run_capture).
run_capture "${incident_dir}/systemctl-status.txt" \
  systemctl status blitz-robot.target blitz-run-context.service blitz-5g-dial.service blitz-5g-link-logger.service blitz-ros-receiver.service blitz-b-side-omnid.service blitz-watchdog.service
run_capture "${incident_dir}/journal.txt" \
  journalctl --no-pager --since "5 minutes ago" -u blitz-run-context.service -u blitz-5g-dial.service -u blitz-5g-link-logger.service -u blitz-ros-receiver.service -u blitz-b-side-omnid.service -u blitz-watchdog.service
run_capture "${incident_dir}/ip-addr.txt" ip addr
run_capture "${incident_dir}/ip-route.txt" ip route
run_capture "${incident_dir}/ss-uapn.txt" ss -uapn
run_capture "${incident_dir}/ss-xlp.txt" ss -xlp

# Copy the modem info JSON when it is configured and present.
if [[ -f "${BLITZ_5G_INFO_JSON:-}" ]]; then
  cp -f "${BLITZ_5G_INFO_JSON}" "${incident_dir}/$(basename "${BLITZ_5G_INFO_JSON}")"
fi

# Keep the last 200 lines of every JSONL file directly inside the run directory.
if [[ -n "${BLITZ_RUN_DIR:-}" && -d "${BLITZ_RUN_DIR}" ]]; then
  while IFS= read -r -d '' jsonl; do
    tail -n 200 "${jsonl}" > "${incident_dir}/tail-$(basename "${jsonl}")"
  done < <(find "${BLITZ_RUN_DIR}" -maxdepth 1 -type f -name '*.jsonl' -print0 2>/dev/null)
fi

blitz_log "${STEP}" "complete" "success" "incident_id=${incident_id} path=${incident_dir}" 0

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Initialises the run context (run id and run directory) and logs the result.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, used to locate the shared helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument to blitz_log calls in this script.
STEP="run-context"
# ERR trap handler: logs the failing line number ($1) and command, then exits
# with the failing command's status.
on_error() {
  # Must be the first statement so that $? still holds the failing status.
  local rc="$?"
  blitz_log "${STEP}" "error" "failure" "line=${1:-unknown} cmd=${BASH_COMMAND:-unknown}" "${rc}"
  exit "${rc}"
}
# NOTE(review): errtrace (`set -E`) is not enabled, so this trap presumably
# does not fire for failures inside shell functions — confirm that is acceptable.
trap 'on_error "${LINENO}"' ERR
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command python3 "${STEP}"
blitz_init_run_context
blitz_log "${STEP}" "complete" "success" "run_id=${BLITZ_RUN_ID} run_dir=${BLITZ_RUN_DIR}" 0

View File

@@ -0,0 +1,971 @@
#!/usr/bin/env bash
# Runtime health watchdog for the robot-side services.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, used to locate the shared helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed as the first argument to blitz_log calls in this script.
STEP="watchdog"
# systemd units supervised by the watchdog.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"
# File paths start empty here; presumably they are assigned during startup
# further down in the script — TODO confirm.
B_SIDE_STATUS_FILE=""
ROS_STATUS_FILE=""
WATCHDOG_STATUS_FILE=""
NETWORK_FAULT_FILE=""
WATCHDOG_EVENT_LOG=""
WATCHDOG_SAMPLE_LOG=""
# Set to 1 after a JSONL logging failure was reported, so the same failure is
# not logged again until an append succeeds.
WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
# Camera, network and restart bookkeeping (used outside the visible helpers).
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
NETWORK_FAIL_COUNT=0
NETWORK_COOLDOWN_UNTIL=0
BACKOFF_UNTIL=0
# Name and timestamp of the most recent action; both are written into the
# status file and the JSONL records.
LAST_ACTION="none"
LAST_ACTION_EPOCH_MS=0
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0
NETWORK_LAST_INTERFACE=""
NETWORK_ROUTE_INTERFACE_LAST_KNOWN=""
NETWORK_PRIMARY_LAST_RETRY_SEC=0
# GPS monitor state: epoch second of the last real check (0 = never),
# previous device presence (-1 = not checked yet, 0 = missing, 1 = present),
# and the cached presence / gpsd-stack results of the last check.
GPS_LAST_CHECK_SEC=0
GPS_DEVICE_PRESENT_PREV=-1
GPS_DEVICE_PRESENT_STATE=1
GPS_STACK_ACTIVE_STATE=1
LAST_REPORTED_FAULT_REASON=""
LAST_REPORTED_RECOVERY_STATE=""
# Associative arrays for targeted-restart bookkeeping; presumably keyed by
# service name — TODO confirm against the code that fills them.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()
# Print the current time as whole seconds since the Unix epoch.
now_epoch_sec() {
  date '+%s'
}
# Print the current time as milliseconds since the Unix epoch
# (epoch seconds followed by the first three digits of the nanoseconds).
now_epoch_ms() {
  date '+%s%3N'
}
# Return the exit status of `systemctl is-active` for the unit given as $1,
# without printing the state.
service_is_active() {
  local unit="$1"

  systemctl --quiet is-active "${unit}"
}
# Return 0 only when BLITZ_GPS_MONITOR_ENABLED is exactly "1"; an unset
# variable counts as disabled.
gps_monitor_enabled() {
  case "${BLITZ_GPS_MONITOR_ENABLED:-0}" in
    1)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# Return 0 when at least one unit listed in BLITZ_GPS_RESTART_UNITS
# (whitespace-separated) is active; return 1 when none is active or the list
# is empty.
gps_stack_active() {
  local -a gps_units=()
  local gps_unit

  read -r -a gps_units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if [[ "${#gps_units[@]}" -eq 0 ]]; then
    return 1
  fi

  for gps_unit in "${gps_units[@]}"; do
    # One active unit is enough.
    service_is_active "${gps_unit}" && return 0
  done

  return 1
}
# Restart the units listed in BLITZ_GPS_RESTART_UNITS and record the outcome.
#
# Arguments:
#   $1  reason for the restart (used in log lines only)
#   $2  comma-separated list of detected GPS devices (used in log lines only)
#
# Side effects: sets GPS_STACK_ACTIVE_STATE to 1 on success and 0 on failure,
# records "gps-reconnect" as the last action, and writes start/success/failure
# log lines.
# Returns: 0 on success, 1 when no units are configured, otherwise the exit
# status of `systemctl restart`.
restart_gps_stack() {
  local reason="$1"
  local devices="$2"
  local units=()
  local rc=0

  read -r -a units <<< "${BLITZ_GPS_RESTART_UNITS:-}"
  if (( ${#units[@]} == 0 )); then
    GPS_STACK_ACTIVE_STATE=0
    blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=empty" 1
    return 1
  fi

  set_last_action "gps-reconnect"
  blitz_log "${STEP}" "gps-reconnect" "start" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0

  # Capture the status right at the failing command. Reading $? after an
  # `if ...; then ...; fi` whose condition failed always yields 0, which would
  # report a failed restart as a success.
  systemctl restart "${units[@]}" || rc=$?

  if (( rc == 0 )); then
    GPS_STACK_ACTIVE_STATE=1
    blitz_log "${STEP}" "gps-reconnect" "success" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" 0
    return 0
  fi

  GPS_STACK_ACTIVE_STATE=0
  blitz_log "${STEP}" "gps-reconnect" "failure" "reason=${reason} devices=${devices} units=${BLITZ_GPS_RESTART_UNITS}" "${rc}"
  return "${rc}"
}
# Check GPS device presence and the gpsd unit stack, restarting the stack when
# the device reappeared or the stack is inactive.
#
# Arguments:
#   $1  current time in epoch seconds
#
# Behaviour:
#   - Monitoring disabled: both cached states are set to healthy, returns 0.
#   - Called again before BLITZ_GPS_CHECK_INTERVAL_SEC (minimum 1) has elapsed
#     since the last real check: returns the cached verdict without probing.
#   - No device matches BLITZ_GPS_DEVICE_GLOB: states are set to unhealthy, the
#     disappearance is logged once, returns 1.
#   - Device present: restarts the stack when the device was missing on the
#     previous check or when no GPS unit is active; returns 0 when healthy or
#     successfully restarted, 1 when the restart failed.
check_gps_health() {
  local now_sec="$1"
  local interval_sec="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
  local glob_pattern="${BLITZ_GPS_DEVICE_GLOB:-}"
  local was_present="${GPS_DEVICE_PRESENT_PREV}"
  local restart_reason=""
  local found_list=""
  local -a found=()

  if ! gps_monitor_enabled; then
    GPS_DEVICE_PRESENT_STATE=1
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  # Never poll more often than once per second.
  (( interval_sec >= 1 )) || interval_sec=1

  # Inside the throttle window: answer from the cached states.
  if (( GPS_LAST_CHECK_SEC != 0 )) && (( now_sec - GPS_LAST_CHECK_SEC < interval_sec )); then
    (( GPS_DEVICE_PRESENT_STATE != 1 || GPS_STACK_ACTIVE_STATE != 1 )) && return 1
    return 0
  fi

  GPS_LAST_CHECK_SEC="${now_sec}"
  mapfile -t found < <(compgen -G "${glob_pattern}" || true)

  if (( ${#found[@]} == 0 )); then
    GPS_DEVICE_PRESENT_STATE=0
    GPS_STACK_ACTIVE_STATE=0
    # Log only on the transition into "missing" (also covers the first check).
    if (( was_present != 0 )); then
      blitz_log "${STEP}" "gps-device-check" "failure" "state=missing glob=${glob_pattern}" 1
    fi
    GPS_DEVICE_PRESENT_PREV=0
    return 1
  fi

  # Join the matched device paths with commas for the log lines.
  printf -v found_list '%s,' "${found[@]}"
  found_list="${found_list%,}"

  GPS_DEVICE_PRESENT_STATE=1
  GPS_DEVICE_PRESENT_PREV=1

  if (( was_present == 0 )); then
    blitz_log "${STEP}" "gps-device-check" "success" "state=reappeared devices=${found_list}" 0
    restart_reason="device-reappeared"
  elif ! gps_stack_active; then
    restart_reason="gpsd-inactive"
  fi

  if [[ -z "${restart_reason}" ]]; then
    GPS_STACK_ACTIVE_STATE=1
    return 0
  fi

  restart_gps_stack "${restart_reason}" "${found_list}" && return 0
  return 1
}
# Return 0 when the file $1 exists and was modified at most $2 seconds ago.
# A file whose modification time cannot be read is treated as modified at
# epoch 0.
status_file_fresh() {
  local status_path="$1"
  local age_limit_sec="$2"
  local current_sec
  local modified_sec

  [[ -f "${status_path}" ]] || return 1

  current_sec="$(now_epoch_sec)"
  modified_sec="$(stat -c %Y "${status_path}" 2>/dev/null || echo 0)"

  if (( current_sec - modified_sec <= age_limit_sec )); then
    return 0
  fi
  return 1
}
# Return 0 when the ROS receiver status JSON at $1 reports a bound socket and a
# receive-thread heartbeat no older than $2 seconds.
#
# Returns non-zero when the file cannot be read or parsed, the heartbeat is
# missing or not positive, the socket is not reported as bound, or the
# heartbeat is too old.
ros_receiver_status_fresh() {
  local status_path="$1"
  local age_limit_sec="$2"
  local current_ms

  current_ms="$(now_epoch_ms)"

  python3 - "${status_path}" "${current_ms}" "${age_limit_sec}" <<'PY'
import json
import sys

status_path = sys.argv[1]
current_ms = int(sys.argv[2])
age_limit_ms = int(sys.argv[3]) * 1000

try:
    with open(status_path, "r", encoding="utf-8") as handle:
        status = json.load(handle)
except Exception:
    # Unreadable or invalid status file counts as stale.
    sys.exit(1)

last_beat_ms = int(status.get("recv_thread_heartbeat_epoch_ms") or 0)
is_bound = bool(status.get("socket_bound"))
if last_beat_ms <= 0 or not is_bound:
    sys.exit(1)

heartbeat_age_ms = current_ms - last_beat_ms
sys.exit(0 if heartbeat_age_ms <= age_limit_ms else 1)
PY
}
# Return 0 only when all ROS receiver checks pass, evaluated in this order:
# the service is active, its unix socket exists, the status file was modified
# within $1 seconds, and the heartbeat inside it is fresh. The first failing
# check determines the return status.
ros_receiver_healthy() {
  local age_limit_sec="$1"

  service_is_active "${ROS_SERVICE}" || return
  [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] || return
  status_file_fresh "${ROS_STATUS_FILE}" "${age_limit_sec}" || return
  ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${age_limit_sec}"
}
# Rewrite the watchdog status JSON file (WATCHDOG_STATUS_FILE).
#
# Arguments (positional):
#   $1 fault_reason   $2 recovery_state   $3 network_ok   $4 camera_ok
#   $5 ros_ok         $6 bside_ok         $7 gps_ok       $8 gps_device_present
#
# The content is written to a temporary file next to the target and then moved
# over it with a single `mv -f`.
# $3..$8 are interpolated unquoted, so callers must pass valid JSON literals —
# presumably true/false or numbers; TODO confirm against the callers.
# NOTE(review): the string fields are interpolated without JSON escaping, so a
# value containing a double quote or backslash would produce invalid JSON.
write_watchdog_status() {
  local fault_reason="$1"
  local recovery_state="$2"
  local network_ok="$3"
  local camera_ok="$4"
  local ros_ok="$5"
  local bside_ok="$6"
  local gps_ok="$7"
  local gps_device_present="$8"
  local tmp_file
  # The PID suffix keeps concurrent writers from sharing a temporary file.
  tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
  cat > "${tmp_file}" <<EOF
{
"updated_at_epoch_ms": $(now_epoch_ms),
"fault_reason": "${fault_reason}",
"recovery_state": "${recovery_state}",
"network_ok": ${network_ok},
"camera_ok": ${camera_ok},
"ros_ok": ${ros_ok},
"bside_ok": ${bside_ok},
"gps_ok": ${gps_ok},
"gps_device_present": ${gps_device_present},
"network_fail_count": ${NETWORK_FAIL_COUNT},
"targeted_restart_count": $(targeted_restart_total),
"full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
"last_action": "${LAST_ACTION}",
"last_action_epoch_ms": ${LAST_ACTION_EPOCH_MS}
}
EOF
  mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
}
# Print one compact JSON record describing a watchdog event or sample.
#
# Arguments (positional):
#   $1 record_type  $2 action  $3 fault_reason  $4 recovery_state  $5 detail
#   $6 incident_id (optional; empty becomes JSON null)
#   $7..$12 network_ok, camera_ok, ros_ok, bside_ok, gps_ok, gps_device_present
#           (optional, default "1"; only the exact string "1" maps to true)
#
# The record also carries the last action, its timestamp, the network failure
# count and the targeted/full restart counters taken from the global state.
watchdog_emit_json() {
  # Caller-supplied fields with their defaults, in the order the Python code expects.
  local -a caller_fields=(
    "$1" "$2" "$3" "$4" "$5"
    "${6:-}"
    "${7:-1}" "${8:-1}" "${9:-1}" "${10:-1}" "${11:-1}" "${12:-1}"
  )

  python3 - "${caller_fields[@]}" "${LAST_ACTION}" "${LAST_ACTION_EPOCH_MS}" "${NETWORK_FAIL_COUNT}" "$(targeted_restart_total)" "${FULL_RESTART_WINDOW_COUNT}" <<'PY'
import json
import sys
import time

(
    record_type,
    action,
    fault_reason,
    recovery_state,
    detail,
    incident_id,
    network_ok,
    camera_ok,
    ros_ok,
    bside_ok,
    gps_ok,
    gps_device_present,
    last_action,
    last_action_epoch_ms,
    network_fail_count,
    targeted_restart_count,
    full_restart_count,
) = sys.argv[1:18]


def as_flag(raw):
    """Shell flags arrive as text; only "1" means true."""
    return raw == "1"


record = {
    "ts_unix_ms": time.time_ns() // 1_000_000,
    "record_type": record_type,
    "action": action,
    "fault_reason": fault_reason,
    "recovery_state": recovery_state,
    "detail": detail,
    "incident_id": incident_id or None,
    "network_ok": as_flag(network_ok),
    "camera_ok": as_flag(camera_ok),
    "ros_ok": as_flag(ros_ok),
    "bside_ok": as_flag(bside_ok),
    "gps_ok": as_flag(gps_ok),
    "gps_device_present": as_flag(gps_device_present),
    "network_fail_count": int(network_fail_count),
    "targeted_restart_count": int(targeted_restart_count),
    "full_restart_count": int(full_restart_count),
    "last_action": last_action,
    "last_action_epoch_ms": int(last_action_epoch_ms or 0),
}
print(json.dumps(record, separators=(",", ":"), ensure_ascii=False))
PY
}
# Append one record to the watchdog event log (best effort).
#
# All arguments are forwarded unchanged to watchdog_emit_json. Does nothing
# when WATCHDOG_EVENT_LOG is empty. A rendering or append failure is reported
# through blitz_log only once per failure streak, tracked by
# WATCHDOG_EVENT_LOG_FAILURE_REPORTED, which is cleared on the next success.
# Returns: always 0.
watchdog_append_event() {
    local json_line=""
    local failure_detail=""

    if [[ -z "${WATCHDOG_EVENT_LOG}" ]]; then
        return 0
    fi

    if ! json_line="$(watchdog_emit_json "$@" 2>&1)"; then
        # stderr was merged into the capture, so it carries the error text.
        failure_detail="${json_line}"
    elif ! blitz_jsonl_append_line "${WATCHDOG_EVENT_LOG}" "${json_line}"; then
        failure_detail="append-failed"
    else
        WATCHDOG_EVENT_LOG_FAILURE_REPORTED=0
        return 0
    fi

    if (( WATCHDOG_EVENT_LOG_FAILURE_REPORTED == 0 )); then
        blitz_log "${STEP}" "watchdog-event-log" "failure" "path=${WATCHDOG_EVENT_LOG} detail=${failure_detail}" 0 || true
        WATCHDOG_EVENT_LOG_FAILURE_REPORTED=1
    fi
    return 0
}
# Append one record to the watchdog sample log (best effort).
#
# All arguments are forwarded unchanged to watchdog_emit_json. Does nothing
# when WATCHDOG_SAMPLE_LOG is empty. A rendering or append failure is reported
# through blitz_log only once per failure streak, tracked by
# WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED, which is cleared on the next success.
# Returns: always 0.
watchdog_append_sample() {
    local json_line=""
    local failure_detail=""

    if [[ -z "${WATCHDOG_SAMPLE_LOG}" ]]; then
        return 0
    fi

    if ! json_line="$(watchdog_emit_json "$@" 2>&1)"; then
        # stderr was merged into the capture, so it carries the error text.
        failure_detail="${json_line}"
    elif ! blitz_jsonl_append_line "${WATCHDOG_SAMPLE_LOG}" "${json_line}"; then
        failure_detail="append-failed"
    else
        WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=0
        return 0
    fi

    if (( WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED == 0 )); then
        blitz_log "${STEP}" "watchdog-sample-log" "failure" "path=${WATCHDOG_SAMPLE_LOG} detail=${failure_detail}" 0 || true
        WATCHDOG_SAMPLE_LOG_FAILURE_REPORTED=1
    fi
    return 0
}
# Emit a "state-transition" event when the reported state pair changes.
#
# Arguments:
#   $1 - current fault reason
#   $2 - current recovery state
# The pair is compared with LAST_REPORTED_FAULT_REASON and
# LAST_REPORTED_RECOVERY_STATE; when either differs, an event is appended and
# both globals are updated. An unchanged pair is a no-op.
watchdog_record_state_transition() {
    local new_fault="$1"
    local new_state="$2"

    if [[ "${new_fault}" != "${LAST_REPORTED_FAULT_REASON}" || "${new_state}" != "${LAST_REPORTED_RECOVERY_STATE}" ]]; then
        watchdog_append_event "event" "state-transition" "${new_fault}" "${new_state}" "" ""
        LAST_REPORTED_FAULT_REASON="${new_fault}"
        LAST_REPORTED_RECOVERY_STATE="${new_state}"
    fi
}
# Start an incident capture attributed to the watchdog (best effort).
#
# Arguments:
#   $1 - reason passed as --reason
#   $2 - systemd unit name passed as --unit
# Whatever blitz_launch_incident_capture prints on stdout is passed through
# (callers capture it as the incident id); its stderr is discarded and its
# failure is ignored.
# Returns: always 0.
watchdog_launch_incident() {
    local incident_reason="$1"
    local affected_unit="$2"
    local -a capture_args=(
        --source watchdog
        --reason "${incident_reason}"
        --unit "${affected_unit}"
        --result failure
        --exit-status 1
    )

    blitz_launch_incident_capture "${capture_args[@]}" 2>/dev/null || true
}
# Record the most recent recovery action and when it happened.
#
# Arguments:
#   $1 - action label stored in LAST_ACTION
# LAST_ACTION_EPOCH_MS is set from the output of now_epoch_ms.
set_last_action() {
    local action_label="$1"

    LAST_ACTION="${action_label}"
    LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}
# Print the sum of all per-fault counters in TARGETED_RESTART_WINDOW_COUNT.
#
# Output: the total on stdout, followed by a newline (0 for an empty array).
targeted_restart_total() {
    local sum=0
    local per_fault_count

    # Only the values matter for the total, so iterate them directly.
    for per_fault_count in "${TARGETED_RESTART_WINDOW_COUNT[@]}"; do
        sum=$(( sum + per_fault_count ))
    done
    printf '%s\n' "${sum}"
}
# Count a targeted restart for one fault key and report whether to escalate.
#
# Arguments:
#   $1 - fault key indexing TARGETED_RESTART_WINDOW_START / _COUNT
# A counting window is opened on the first restart for the key, or when more
# than 60 seconds have passed since the window start; otherwise the existing
# window's counter is incremented.
# Returns: 0 when the counter for the current window has reached 2 or more
#          (the caller escalates), 1 otherwise.
register_targeted_restart() {
    local fault_key="$1"
    local now_sec
    local opened_at
    local restarts

    now_sec="$(now_epoch_sec)"
    opened_at="${TARGETED_RESTART_WINDOW_START["${fault_key}"]:-0}"
    restarts="${TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]:-0}"

    if (( opened_at != 0 && now_sec - opened_at <= 60 )); then
        restarts=$(( restarts + 1 ))
    else
        opened_at="${now_sec}"
        restarts=1
    fi

    TARGETED_RESTART_WINDOW_START["${fault_key}"]="${opened_at}"
    TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${restarts}"

    if (( restarts >= 2 )); then
        return 0
    fi
    return 1
}
# Count a full-stack restart and enter backoff when restarts pile up.
#
# A counting window (FULL_RESTART_WINDOW_START / FULL_RESTART_WINDOW_COUNT) is
# opened on the first restart, or when more than 600 seconds have passed since
# the window start; otherwise the counter is incremented. Once the counter
# reaches 3, BACKOFF_UNTIL is set 60 seconds ahead and a "backoff-enter" event
# is appended.
record_full_restart() {
    local now_sec
    now_sec="$(now_epoch_sec)"

    if (( FULL_RESTART_WINDOW_START != 0 && now_sec - FULL_RESTART_WINDOW_START <= 600 )); then
        FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
    else
        FULL_RESTART_WINDOW_START="${now_sec}"
        FULL_RESTART_WINDOW_COUNT=1
    fi

    if (( FULL_RESTART_WINDOW_COUNT < 3 )); then
        return 0
    fi

    BACKOFF_UNTIL=$(( now_sec + 60 ))
    watchdog_append_event "event" "backoff-enter" "backoff" "backoff" "full_restart_count=${FULL_RESTART_WINDOW_COUNT}" ""
}
# Restart only the B-side service, escalating to a full-stack restart when the
# same fault key keeps recurring.
#
# Arguments:
#   $1 - fault key used for per-fault restart accounting
#   $2 - human-readable reason recorded in logs and events
# Behaviour:
#   * If register_targeted_restart reports that the restart budget for this
#     key is used up, full_restart_stack is run with "<reason>-escalated" and
#     the function returns 0 regardless of that call's result.
#   * Otherwise an incident capture is launched, LAST_ACTION is updated,
#     RECOVERY_ACTION_TAKEN is set to 1, and the B-side unit is restarted.
# Returns: 0 on success or escalation; the `systemctl restart` exit status on
#          failure.
restart_bside_targeted() {
    local fault_key="$1"
    local reason="$2"
    local rc=0
    local incident_id=""

    if register_targeted_restart "${fault_key}"; then
        blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
        watchdog_append_event "event" "escalate-full-restart" "${reason}-escalated" "recovering" "fault_key=${fault_key}" ""
        full_restart_stack "${reason}-escalated"
        return 0
    fi

    incident_id="$(watchdog_launch_incident "${reason}" "${B_SIDE_SERVICE}")"
    set_last_action "restart-bside"
    RECOVERY_ACTION_TAKEN=1
    blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
    watchdog_append_event "event" "restart-bside-start" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"

    if systemctl restart "${B_SIDE_SERVICE}"; then
        blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
        watchdog_append_event "event" "restart-bside-success" "${reason}" "recovering" "fault_key=${fault_key}" "${incident_id}"
        return 0
    else
        # Capture the status inside the else branch: after `fi`, $? is the
        # status of the whole `if` compound, which is 0 when no branch ran,
        # so the real systemctl failure code would be lost.
        rc=$?
    fi

    blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
    watchdog_append_event "event" "restart-bside-failure" "${reason}" "recovering" "fault_key=${fault_key} rc=${rc}" "${incident_id}"
    return "${rc}"
}
# Restart the whole stack: stop B-side, restart ROS, wait for the ROS socket,
# then start B-side again.
#
# Arguments:
#   $1 - reason recorded in logs, events and the incident capture
# Side effects:
#   * Launches an incident capture for "blitz-robot.target", updates
#     LAST_ACTION and sets RECOVERY_ACTION_TAKEN=1.
#   * Assigns recovery_state and fault_reason WITHOUT declaring them local, so
#     the values land in the nearest enclosing scope that owns those names.
#   * Calls record_full_restart exactly once on every exit path.
#   * Every "full-restart-start" event is paired with either a
#     "full-restart-success" or a "full-restart-failure" event whose detail
#     names the failing stage (restart-ros, wait-socket or start-bside).
# Returns: 0 on success, otherwise the exit status of the stage that failed.
full_restart_stack() {
    local reason="$1"
    local rc
    local incident_id=""

    incident_id="$(watchdog_launch_incident "${reason}" "blitz-robot.target")"
    set_last_action "full-restart"
    RECOVERY_ACTION_TAKEN=1
    recovery_state="recovering"
    fault_reason="${reason}"
    blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
    watchdog_append_event "event" "full-restart-start" "${reason}" "recovering" "" "${incident_id}"

    # Stopping B-side is best effort; it may already be down.
    systemctl stop "${B_SIDE_SERVICE}" || true

    if systemctl restart "${ROS_SERVICE}"; then
        blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0
    else
        rc=$?
        blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
        watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=restart-ros rc=${rc}" "${incident_id}"
        record_full_restart
        return "${rc}"
    fi

    if bash "${BOOT_SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}"; then
        :
    else
        rc=$?
        blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
        watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=wait-socket rc=${rc}" "${incident_id}"
        record_full_restart
        return "${rc}"
    fi

    if systemctl start "${B_SIDE_SERVICE}"; then
        blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0
    else
        rc=$?
        blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
        watchdog_append_event "event" "full-restart-failure" "${reason}" "recovering" "stage=start-bside rc=${rc}" "${incident_id}"
        record_full_restart
        return "${rc}"
    fi

    watchdog_append_event "event" "full-restart-success" "${reason}" "recovering" "" "${incident_id}"
    record_full_restart
}
# Report whether a simulated network outage is currently requested.
#
# Returns: 0 only when BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION is exactly "1" and
#          NETWORK_FAULT_FILE exists as a regular file; 1 otherwise.
network_fault_injected() {
    if [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" != "1" ]]; then
        return 1
    fi
    [[ -f "${NETWORK_FAULT_FILE}" ]]
}
# Resolve the 5G interface and cache it in NETWORK_LAST_INTERFACE.
#
# NETWORK_LAST_INTERFACE is always overwritten with whatever
# blitz_resolve_5g_interface printed (empty on failure). On success the name
# is also remembered in NETWORK_ROUTE_INTERFACE_LAST_KNOWN.
# Returns: 0 when an interface name was resolved, 1 otherwise.
resolve_network_interface() {
    NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)"

    [[ -n "${NETWORK_LAST_INTERFACE}" ]] || return 1

    NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${NETWORK_LAST_INTERFACE}"
    return 0
}
# Print the destinations whose routes the watchdog manages, one per line.
#
# BLITZ_TIME_SERVER_IP (when set) is printed first, followed by every entry of
# BLITZ_5G_ROUTE_TARGETS (comma- and/or whitespace-separated) that is not
# equal to the time server.
network_route_targets() {
    local time_server="${BLITZ_TIME_SERVER_IP:-}"
    local entry
    # Intentionally unquoted: commas become spaces and the shell splits the
    # list into words.
    local -a extra_targets=( ${BLITZ_5G_ROUTE_TARGETS//,/ } )

    if [[ -n "${time_server}" ]]; then
        printf '%s\n' "${BLITZ_TIME_SERVER_IP}"
    fi

    for entry in "${extra_targets[@]}"; do
        if [[ -z "${entry}" || "${entry}" == "${time_server}" ]]; then
            continue
        fi
        printf '%s\n' "${entry}"
    done
}
# Log the kernel's current route decision for every managed target.
#
# Arguments:
#   $1 - label recorded as "action=" in each log line
# For each target from network_route_targets, the first line of
# `ip route get <target>` (stderr included) is logged; "unresolved" is logged
# when that command produced no output.
log_target_route_paths() {
    local action="$1"
    local destination
    local first_line

    while IFS= read -r destination; do
        if [[ -z "${destination}" ]]; then
            continue
        fi
        first_line="$(ip route get "${destination}" 2>&1 | head -n 1 || true)"
        first_line="${first_line:-unresolved}"
        blitz_log "${STEP}" "route-path" "info" "action=${action} target=${destination} route=${first_line}" 0
    done < <(network_route_targets)
}
# Test whether a route line names the given interface as its output device.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - interface name
# Returns: 0 when the line contains " dev <name>" followed by a space or the
#          end of the line; 1 otherwise, including when the name is empty.
route_output_uses_interface() {
    local route_text="$1"
    local ifname="$2"

    if [[ -z "${ifname}" ]]; then
        return 1
    fi

    case "${route_text}" in
        *" dev ${ifname} "* | *" dev ${ifname}")
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}
# Test whether a route line goes through the given gateway.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - gateway address
# Returns: 0 when the line contains "via <gateway>" followed by a space or the
#          end of the line; 1 otherwise, including when the gateway is empty.
#
# The trailing boundary is required so that gateway 10.0.0.1 does not match a
# route "via 10.0.0.10": a plain substring test would let the watchdog treat
# (and delete, or skip replacing) a route that belongs to a different gateway.
route_output_uses_gateway() {
    local route_output="$1"
    local gateway="$2"

    [[ -n "${gateway}" ]] || return 1
    [[ "${route_output}" == *"via ${gateway} "* || "${route_output}" == *"via ${gateway}" ]]
}
# Test whether a route line already matches the wanted interface AND gateway.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - wanted interface name
#   $3 - wanted gateway address
# Returns: 0 when both the interface and the gateway check pass; otherwise the
#          status of the first check that failed.
route_is_desired_target_route() {
    local route_text="$1"
    local ifname="$2"
    local gw="$3"

    route_output_uses_interface "${route_text}" "${ifname}" || return
    route_output_uses_gateway "${route_text}" "${gw}"
}
# Test whether a route line looks like one this watchdog installed for 5G.
#
# Arguments:
#   $1 - one line of `ip route` output
#   $2 - interface name (optional)
#   $3 - gateway address (optional)
# Returns: 0 when the line uses the given interface, OR the given gateway, OR
#          the configured BLITZ_5G_GATEWAY; 1 when none of them match.
route_is_managed_5g_route() {
    local route_text="$1"
    local ifname="${2:-}"
    local gw="${3:-}"

    if route_output_uses_interface "${route_text}" "${ifname}" \
        || route_output_uses_gateway "${route_text}" "${gw}" \
        || route_output_uses_gateway "${route_text}" "${BLITZ_5G_GATEWAY:-}"; then
        return 0
    fi
    return 1
}
# Print the interface name to use when cleaning up managed routes.
#
# Sources are tried in order and the first non-empty one wins:
#   1. NETWORK_LAST_INTERFACE
#   2. NETWORK_ROUTE_INTERFACE_LAST_KNOWN
#   3. the interface read via blitz_read_5g_info_interface from BLITZ_5G_INFO_JSON
# Returns: 0 with the name on stdout, or 1 with no output when all are empty.
resolve_route_cleanup_interface() {
    local chosen=""

    if [[ -n "${NETWORK_LAST_INTERFACE}" ]]; then
        chosen="${NETWORK_LAST_INTERFACE}"
    elif [[ -n "${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}" ]]; then
        chosen="${NETWORK_ROUTE_INTERFACE_LAST_KNOWN}"
    else
        chosen="$(blitz_read_5g_info_interface "${BLITZ_5G_INFO_JSON:-}" || true)"
    fi

    [[ -n "${chosen}" ]] || return 1
    printf '%s\n' "${chosen}"
    return 0
}
# Print the gateway address to use for the given interface.
#
# Arguments:
#   $1 - interface name
# The gateway is the word following the first "via" in the first default
# route reported for that interface by `ip -o route show default dev <if>`.
# When no such route or no "via" is present, BLITZ_5G_GATEWAY is used instead.
# Returns: 0 with the gateway on stdout, or 1 when neither source yields one.
resolve_network_gateway() {
    local interface_name="$1"
    local first_default_route
    local found=""
    local previous_word=""
    local word
    local -a words=()

    first_default_route="$(ip -o route show default dev "${interface_name}" 2>/dev/null | head -n 1 || true)"
    if [[ -n "${first_default_route}" ]]; then
        read -r -a words <<< "${first_default_route}"
        for word in "${words[@]}"; do
            # The word right after "via" is the gateway address.
            if [[ "${previous_word}" == "via" ]]; then
                found="${word}"
                break
            fi
            previous_word="${word}"
        done
    fi

    # Fall back to the statically configured gateway when none was discovered.
    found="${found:-${BLITZ_5G_GATEWAY:-}}"
    if [[ -z "${found}" ]]; then
        return 1
    fi
    printf '%s\n' "${found}"
    return 0
}
# Make sure every managed target has a /32 route via the 5G interface.
#
# Arguments:
#   $1 - 5G interface name (required)
#   $2 - gateway (optional; resolved with resolve_network_gateway when empty)
# Targets whose existing /32 route already uses this interface and gateway are
# left alone; all others are installed with `ip route replace`. When at least
# one route was changed, NETWORK_ROUTE_INTERFACE_LAST_KNOWN is updated and the
# resulting paths are logged.
# Returns: 1 when the interface is empty or no gateway can be determined; the
#          `ip route replace` exit status on the first failure (remaining
#          targets are not processed); 0 otherwise.
sync_target_routes_to_5g() {
    local interface_name="$1"
    local gateway="${2:-}"
    local existing_route=""
    local changed=0
    local destination
    local rc

    [[ -n "${interface_name}" ]] || return 1

    if [[ -z "${gateway}" ]]; then
        gateway="$(resolve_network_gateway "${interface_name}" || true)"
    fi
    if [[ -z "${gateway}" ]]; then
        blitz_log "${STEP}" "route-sync-gateway" "failure" "interface=${interface_name}" 1
        return 1
    fi

    while IFS= read -r destination; do
        if [[ -z "${destination}" ]]; then
            continue
        fi

        existing_route="$(ip route show "${destination}/32" 2>/dev/null | head -n 1 || true)"
        if [[ -n "${existing_route}" ]] && route_is_desired_target_route "${existing_route}" "${interface_name}" "${gateway}"; then
            continue
        fi

        ip route replace "${destination}/32" via "${gateway}" dev "${interface_name}" || {
            rc=$?
            blitz_log "${STEP}" "route-sync-target" "failure" "target=${destination} interface=${interface_name} gateway=${gateway}" "${rc}"
            return "${rc}"
        }
        changed=1
        blitz_log "${STEP}" "route-sync-target" "success" "target=${destination} interface=${interface_name} gateway=${gateway}" 0
    done < <(network_route_targets)

    if (( changed == 1 )); then
        NETWORK_ROUTE_INTERFACE_LAST_KNOWN="${interface_name}"
        log_target_route_paths "sync-to-5g"
    fi
    return 0
}
# Remove the managed /32 target routes that point at the 5G path.
#
# Arguments:
#   $1 - interface name (optional; resolve_route_cleanup_interface when empty)
#   $2 - gateway (optional; resolve_network_gateway for the interface, then
#        BLITZ_5G_GATEWAY, when empty)
# Only routes that route_is_managed_5g_route recognises are deleted; targets
# with no /32 route or with an unrelated route are skipped. When anything was
# removed, a summary and the resulting paths are logged.
# Returns: the `ip route del` exit status on the first failure (remaining
#          targets are not processed); 0 otherwise.
clear_target_routes_from_5g() {
    local interface_name="${1:-}"
    local gateway="${2:-}"
    local existing_route=""
    local destination
    local deleted=0
    local rc

    if [[ -z "${interface_name}" ]]; then
        interface_name="$(resolve_route_cleanup_interface || true)"
    fi
    if [[ -z "${gateway}" && -n "${interface_name}" ]]; then
        gateway="$(resolve_network_gateway "${interface_name}" || true)"
    fi
    if [[ -z "${gateway}" ]]; then
        gateway="${BLITZ_5G_GATEWAY:-}"
    fi

    while IFS= read -r destination; do
        if [[ -z "${destination}" ]]; then
            continue
        fi

        existing_route="$(ip route show "${destination}/32" 2>/dev/null | head -n 1 || true)"
        if [[ -z "${existing_route}" ]]; then
            continue
        fi
        if ! route_is_managed_5g_route "${existing_route}" "${interface_name}" "${gateway}"; then
            continue
        fi

        ip route del "${destination}/32" || {
            rc=$?
            blitz_log "${STEP}" "route-clear-target" "failure" "target=${destination} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" "${rc}"
            return "${rc}"
        }
        deleted=1
        blitz_log "${STEP}" "route-clear-target" "success" "target=${destination} interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
    done < <(network_route_targets)

    if (( deleted == 1 )); then
        blitz_log "${STEP}" "route-clear" "success" "interface=${interface_name:-unknown} gateway=${gateway:-unknown}" 0
        log_target_route_paths "clear-from-5g"
    fi
    return 0
}
# Try to restore connectivity over 5G by reinstalling the target routes.
#
# Arguments:
#   $1 - 5G interface name
# Steps: resolve the gateway, sync the target routes, confirm with
# blitz_route_ready that BLITZ_TIME_SERVER_IP is routed via the interface,
# then ping it once through the interface. If any step after gateway
# resolution fails, the managed routes are cleared again (best effort).
# Returns: 0 when every step succeeded, 1 otherwise.
repair_network_routes() {
    local interface_name="$1"
    local gateway=""
    local ready_route

    [[ -n "${interface_name}" ]] || return 1

    gateway="$(resolve_network_gateway "${interface_name}" || true)"
    if [[ -z "${gateway}" ]]; then
        blitz_log "${STEP}" "route-repair-gateway" "failure" "interface=${interface_name}" 1
        return 1
    fi

    sync_target_routes_to_5g "${interface_name}" "${gateway}" || {
        clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
        return 1
    }

    ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${interface_name}" || true)"
    if [[ -z "${ready_route}" ]]; then
        clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
        blitz_log "${STEP}" "route-repair-postcheck" "failure" "interface=${interface_name} gateway=${gateway}" 1
        return 1
    fi

    ping -I "${interface_name}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 || {
        clear_target_routes_from_5g "${interface_name}" "${gateway}" || true
        blitz_log "${STEP}" "route-repair-probe" "failure" "interface=${interface_name} target=${BLITZ_TIME_SERVER_IP}" 1
        return 1
    }

    blitz_log "${STEP}" "route-repair-postcheck" "success" "interface=${interface_name} gateway=${gateway} route=${ready_route}" 0
    return 0
}
# Probe the primary (5G) path to the time server.
#
# NETWORK_LAST_INTERFACE is cleared first and then refreshed by
# resolve_network_interface, so callers can read it afterwards.
# Returns: 1 when a fault is injected, the interface cannot be resolved, or
#          blitz_route_ready prints nothing for the target on that interface;
#          otherwise the exit status of a single ping sent through the interface.
network_is_healthy() {
    local ready_route

    NETWORK_LAST_INTERFACE=""

    if network_fault_injected || ! resolve_network_interface; then
        return 1
    fi

    ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
    [[ -n "${ready_route}" ]] || return 1

    ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
# Probe the time server over whatever route the system picks (no interface pin).
#
# Returns: 1 when BLITZ_TIME_SERVER_IP is empty or blitz_route_ready prints
#          nothing for it; otherwise the exit status of a single ping.
fallback_network_is_healthy() {
    local ready_route

    [[ -n "${BLITZ_TIME_SERVER_IP:-}" ]] || return 1

    ready_route="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" || true)"
    [[ -n "${ready_route}" ]] || return 1

    ping -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
# Poll network_is_healthy once per second until it passes or time runs out.
#
# Arguments:
#   $1 - number of polling attempts (one second apart)
# A "waiting" line is logged on the first attempt and on every fifth one.
# Returns: 0 as soon as the network is healthy, 1 after the last attempt fails.
wait_for_network_recovery() {
    local timeout_sec="$1"
    local elapsed

    for (( elapsed = 0; elapsed < timeout_sec; elapsed++ )); do
        if network_is_healthy; then
            blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${elapsed}" 0
            return 0
        fi
        # elapsed == 0 also satisfies the modulo test, so it covers the first attempt.
        if (( elapsed % 5 == 0 )); then
            blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${elapsed}" 0
        fi
        sleep 1
    done

    blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
    return 1
}
# Recover the primary network path, cheapest remedy first.
#
# 1. Route repair: if the interface resolves and repair_network_routes
#    succeeds, start the cooldown, reset NETWORK_FAIL_COUNT and return 0.
# 2. Redial: otherwise launch an incident capture, stop B-side (best effort),
#    run 5g-dial.sh, and wait up to BLITZ_5G_ROUTE_WAIT_SEC for the network.
#    On success start the cooldown, reset NETWORK_FAIL_COUNT and bring the
#    services back: a targeted B-side restart when the ROS receiver is
#    healthy, a full-stack restart otherwise (their results are not propagated).
# Sets RECOVERY_ACTION_TAKEN=1 and LAST_ACTION on both paths.
# Returns: 0 on success; the exit status of the redial script or of the
#          post-check when that stage failed.
perform_network_recovery() {
    local rc=0
    local incident_id=""

    if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
        set_last_action "route-repair"
        RECOVERY_ACTION_TAKEN=1
        NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
        NETWORK_FAIL_COUNT=0
        blitz_log "${STEP}" "network-recovery" "success" "mode=route-repair interface=${NETWORK_LAST_INTERFACE}" 0
        watchdog_append_event "event" "route-repair-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE}" ""
        return 0
    fi

    incident_id="$(watchdog_launch_incident "network-recovery" "blitz-5g-dial.service")"
    set_last_action "network-recovery"
    RECOVERY_ACTION_TAKEN=1
    blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
    watchdog_append_event "event" "network-recovery-start" "network_or_robot_unreachable" "recovering" "fail_count=${NETWORK_FAIL_COUNT}" "${incident_id}"
    systemctl stop "${B_SIDE_SERVICE}" || true

    # `cmd || { rc=$?; ... }` keeps the failing command's own exit status.
    bash "${BOOT_SCRIPT_DIR}/5g-dial.sh" || {
        rc=$?
        blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT} script=${BOOT_SCRIPT_DIR}/5g-dial.sh" "${rc}"
        watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=redial rc=${rc}" "${incident_id}"
        return "${rc}"
    }

    wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}" || {
        rc=$?
        blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
        watchdog_append_event "event" "network-recovery-failure" "network_or_robot_unreachable" "recovering" "stage=postcheck rc=${rc}" "${incident_id}"
        return "${rc}"
    }

    NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
    NETWORK_FAIL_COUNT=0
    watchdog_append_event "event" "network-recovery-success" "network_or_robot_unreachable" "recovering" "interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${incident_id}"

    if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
        restart_bside_targeted "network" "network-recovered"
    else
        full_restart_stack "network-recovered-ros-unhealthy"
    fi
    return 0
}
# --- One-time watchdog initialisation -------------------------------------
# Load the boot environment, then verify the prerequisites checked below
# (root, and the systemctl / stat / ping / python3 commands).
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command stat "${STEP}"
blitz_require_command ping "${STEP}"
blitz_require_command python3 "${STEP}"
blitz_prepare_runtime_dir
blitz_require_run_context
# Status files and the fault-injection marker live under BLITZ_RUNTIME_DIR.
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
# JSONL logs (state-change events and per-iteration samples) live under BLITZ_RUN_DIR.
WATCHDOG_EVENT_LOG="${BLITZ_RUN_DIR}/watchdog-events.jsonl"
WATCHDOG_SAMPLE_LOG="${BLITZ_RUN_DIR}/watchdog-samples.jsonl"
# --- Main watchdog loop ----------------------------------------------------
# Each iteration evaluates network, GPS, camera, B-side and ROS health in that
# order, performs at most one service-level recovery action (guarded by
# RECOVERY_ACTION_TAKEN), publishes the resulting state, and then sleeps for
# BLITZ_WATCHDOG_INTERVAL_SEC.
while true; do
# Start every iteration from an "all healthy" assumption.
fault_reason="none"
recovery_state="ok"
network_ok=1
camera_ok=1
ros_ok=1
bside_ok=1
gps_ok=1
gps_device_present=1
RECOVERY_ACTION_TAKEN=0
now_sec="$(now_epoch_sec)"
# Seed the GPS flags from the last known GPS state so that the backoff
# branch below (which skips check_gps_health) still reports them.
if gps_monitor_enabled; then
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
if (( GPS_DEVICE_PRESENT_STATE == 0 || GPS_STACK_ACTIVE_STATE == 0 )); then
gps_ok=0
fi
fi
# Backoff: while BACKOFF_UNTIL lies in the future, take no action at all.
# Only publish a "backoff" state (network/camera/ros/bside reported as 0).
if (( BACKOFF_UNTIL > now_sec )); then
fault_reason="backoff"
recovery_state="backoff"
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" 0 0 0 0 "${gps_ok}" "${gps_device_present}"
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
continue
fi
# --- Network ---
# During the post-recovery cooldown the network is not probed at all.
if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
recovery_state="recovering"
elif ! network_is_healthy; then
# Primary (5G) path is down: drop the managed target routes first, then
# probe the target again without pinning an interface.
clear_target_routes_from_5g || true
if fallback_network_is_healthy; then
# Fallback path works: run degraded and reset the failure counter.
NETWORK_FAIL_COUNT=0
fault_reason="network_fallback_active"
recovery_state="degraded"
blitz_log "${STEP}" "network-check" "fallback" "interface=${NETWORK_LAST_INTERFACE:-unresolved} target=${BLITZ_TIME_SERVER_IP}" 0
# Retry the primary path at most once every 10 seconds.
if (( NETWORK_PRIMARY_LAST_RETRY_SEC == 0 || now_sec - NETWORK_PRIMARY_LAST_RETRY_SEC >= 10 )); then
NETWORK_PRIMARY_LAST_RETRY_SEC="${now_sec}"
if resolve_network_interface && repair_network_routes "${NETWORK_LAST_INTERFACE}"; then
NETWORK_PRIMARY_LAST_RETRY_SEC=0
fault_reason="none"
recovery_state="ok"
blitz_log "${STEP}" "network-check" "primary-restored" "interface=${NETWORK_LAST_INTERFACE} target=${BLITZ_TIME_SERVER_IP}" 0
log_target_route_paths "primary-restored"
fi
fi
else
# Neither path works: count the failure and start recovery once the
# configured threshold is reached.
network_ok=0
NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
fault_reason="network_or_robot_unreachable"
recovery_state="recovering"
blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
perform_network_recovery || true
fi
fi
else
# Primary path is healthy: reset counters and keep the target routes
# pointed at the 5G interface.
NETWORK_PRIMARY_LAST_RETRY_SEC=0
NETWORK_FAIL_COUNT=0
sync_target_routes_to_5g "${NETWORK_LAST_INTERFACE}" || true
fi
# --- GPS ---
# A GPS problem only becomes the reported fault when nothing else has
# already set fault_reason.
if check_gps_health "${now_sec}"; then
gps_ok=1
else
gps_ok=0
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
if [[ "${fault_reason}" == "none" ]]; then
if (( GPS_DEVICE_PRESENT_STATE == 0 )); then
fault_reason="gps_device_missing"
else
fault_reason="gps_reconnect_failed"
fi
recovery_state="degraded"
fi
fi
gps_device_present="${GPS_DEVICE_PRESENT_STATE}"
# --- Camera ---
# A missing device node is reported as degraded. After it reappears, B-side
# is restarted only once the device has been seen on 2 consecutive
# iterations in which no recovery action had been taken yet.
if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
camera_ok=0
fault_reason="camera_missing"
recovery_state="degraded"
CAMERA_MISSING_PREV=1
CAMERA_RECOVERY_STABLE_COUNT=0
elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
recovery_state="recovering"
fault_reason="camera_recovered"
if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
restart_bside_targeted "camera" "camera-reappeared" || true
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
fi
else
CAMERA_RECOVERY_STABLE_COUNT=0
fi
# --- B-side ---
# Skipped when a recovery action already ran this iteration. Unhealthy means
# the unit is not active or its status file is stale.
if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
bside_ok=0
fault_reason="bside_status_stale"
recovery_state="recovering"
restart_bside_targeted "bside" "bside-unhealthy" || true
fi
# --- ROS receiver ---
# Skipped when a recovery action already ran this iteration. An unhealthy
# receiver triggers a full-stack restart.
if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
ros_ok=0
fault_reason="ros_receiver_unhealthy"
recovery_state="recovering"
full_restart_stack "ros-unhealthy" || true
fi
# --- Publish ---
# Emit a state-transition event if the state pair changed, rewrite the status
# file, append one sample record, then wait for the next iteration.
watchdog_record_state_transition "${fault_reason}" "${recovery_state}"
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
watchdog_append_sample "sample" "loop" "${fault_reason}" "${recovery_state}" "" "" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" "${gps_ok}" "${gps_device_present}"
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done

View File

@@ -28,13 +28,17 @@ blitz_host_from_addr() {
blitz_load_boot_env() { blitz_load_boot_env() {
local env_file local env_file
local default_time_server local default_time_server
local dev_run_root
local dev_runtime_dir
if [[ "${BLITZ_BOOT_ENV_LOADED:-0}" == "1" ]]; then if [[ "${BLITZ_BOOT_ENV_LOADED:-0}" == "1" ]]; then
return 0 return 0
fi fi
export BLITZ_BOOT_LOADING_ENV="1"
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source "${DEV_SCRIPT_DIR}/load-env.sh" source "${DEV_SCRIPT_DIR}/load-env.sh"
unset BLITZ_BOOT_LOADING_ENV
for env_file in \ for env_file in \
"${BOOT_SCRIPT_DIR}/robot-boot.env" \ "${BOOT_SCRIPT_DIR}/robot-boot.env" \
@@ -48,10 +52,38 @@ blitz_load_boot_env() {
fi fi
done done
if declare -F normalize_loaded_env_vars >/dev/null 2>&1; then
normalize_loaded_env_vars
fi
dev_run_root="${OMNISOCKETGO_ROOT}/logs"
dev_runtime_dir="${dev_run_root}/runtime"
if [[ -z "${BLITZ_RUN_ROOT:-}" || "${BLITZ_RUN_ROOT}" == "${dev_run_root}" ]]; then
export BLITZ_RUN_ROOT="/var/log/blitz-robot"
fi
if [[ -z "${BLITZ_RUNTIME_DIR:-}" || "${BLITZ_RUNTIME_DIR}" == "${dev_runtime_dir}" ]]; then
export BLITZ_RUNTIME_DIR="/run/blitz-robot"
fi
if [[ -z "${BLITZ_RUN_CONTEXT_FILE:-}" || "${BLITZ_RUN_CONTEXT_FILE}" == "${dev_runtime_dir}/run-context.env" ]]; then
export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUNTIME_DIR}/run-context.env"
fi
if [[ -z "${BLITZ_RUN_ID_FILE:-}" || "${BLITZ_RUN_ID_FILE}" == "${dev_runtime_dir}/run-id" ]]; then
export BLITZ_RUN_ID_FILE="${BLITZ_RUNTIME_DIR}/run-id"
fi
if [[ -z "${BLITZ_CURRENT_RUN_LINK:-}" || "${BLITZ_CURRENT_RUN_LINK}" == "${dev_run_root}/current" ]]; then
export BLITZ_CURRENT_RUN_LINK="${BLITZ_RUN_ROOT}/current"
fi
default_time_server="$(blitz_host_from_addr "${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}" || true)" default_time_server="$(blitz_host_from_addr "${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}" || true)"
export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}" export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}"
export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-/var/log/blitz-robot}"
export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}" export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}"
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}"
export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}"
export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}"
export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}"
export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}" export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}"
export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}" export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}"
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}" export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
@@ -65,12 +97,28 @@ blitz_load_boot_env() {
export BLITZ_5G_SERIAL_WAIT_SEC="${BLITZ_5G_SERIAL_WAIT_SEC:-60}" export BLITZ_5G_SERIAL_WAIT_SEC="${BLITZ_5G_SERIAL_WAIT_SEC:-60}"
export BLITZ_5G_ROUTE_WAIT_SEC="${BLITZ_5G_ROUTE_WAIT_SEC:-30}" export BLITZ_5G_ROUTE_WAIT_SEC="${BLITZ_5G_ROUTE_WAIT_SEC:-30}"
export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-${default_time_server}}" export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-${default_time_server}}"
export BLITZ_TIME_SERVER_PORT="${BLITZ_TIME_SERVER_PORT:-123}"
export BLITZ_TIME_SYNC_WAIT_SEC="${BLITZ_TIME_SYNC_WAIT_SEC:-60}"
export BLITZ_TIME_SYNC_MAX_OFFSET_SEC="${BLITZ_TIME_SYNC_MAX_OFFSET_SEC:-0.002}"
export BLITZ_TIME_SYNC_INTERVAL_SEC="${BLITZ_TIME_SYNC_INTERVAL_SEC:-1}"
export BLITZ_ROS_USER="${BLITZ_ROS_USER:-nvidia}" export BLITZ_ROS_USER="${BLITZ_ROS_USER:-nvidia}"
export BLITZ_ROS_SOCKET_WAIT_SEC="${BLITZ_ROS_SOCKET_WAIT_SEC:-20}" export BLITZ_ROS_SOCKET_WAIT_SEC="${BLITZ_ROS_SOCKET_WAIT_SEC:-20}"
export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}"
export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}"
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}"
export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}"
export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}"
export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}"
export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}"
export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}"
export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}"
export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}"
export BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="${BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC:-5}"
export BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="${BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC:-30}"
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
export BLITZ_GPS_MONITOR_ENABLED="${BLITZ_GPS_MONITOR_ENABLED:-1}"
export BLITZ_GPS_DEVICE_GLOB="${BLITZ_GPS_DEVICE_GLOB:-/dev/ttyCH341USB*}"
export BLITZ_GPS_CHECK_INTERVAL_SEC="${BLITZ_GPS_CHECK_INTERVAL_SEC:-10}"
export BLITZ_GPS_RESTART_UNITS="${BLITZ_GPS_RESTART_UNITS:-gpsd.socket gpsd.service}"
export BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}"
export BLITZ_BOOT_ENV_LOADED="1" export BLITZ_BOOT_ENV_LOADED="1"
} }
@@ -200,3 +248,414 @@ blitz_route_ready() {
printf '%s\n' "${route_output}" printf '%s\n' "${route_output}"
return 0 return 0
} }
# Test whether a network interface with the given name exists.
#
# Arguments:
#   $1 - interface name (optional)
# Returns: 1 for an empty name; otherwise the exit status of
#          `ip link show dev <name>` (its output is discarded).
blitz_interface_exists() {
    local ifname="${1:-}"

    [[ -n "${ifname}" ]] || return 1
    ip link show dev "${ifname}" >/dev/null 2>&1
}
# Print the "interface" value stored in the 5G info JSON file.
#
# Arguments:
#   $1 - path to the JSON file
# Returns: 0 with the stripped interface name on stdout; 1 (with no output on
#          stdout or stderr) when the path is empty, is not a regular file,
#          cannot be read or parsed, is not a JSON object, or has no
#          non-empty "interface" value.
blitz_read_5g_info_interface() {
    local info_json="$1"

    if [[ -z "${info_json}" || ! -f "${info_json}" ]]; then
        return 1
    fi

    python3 - "${info_json}" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
except Exception:
    raise SystemExit(1)
# Valid JSON that is not an object (for example a list) has no .get();
# treat it like any other unusable file instead of raising a traceback.
if not isinstance(payload, dict):
    raise SystemExit(1)
interface = str(payload.get("interface") or "").strip()
if not interface:
    raise SystemExit(1)
print(interface)
PY
}
# Print the interface that holds an IPv4 address inside the modem subnet.
#
# Arguments:
#   $1 - subnet in CIDR form (optional; defaults to BLITZ_5G_MODEM_SUBNET)
# Interfaces lo, docker0 and l4tbr0 are ignored. When several interfaces
# match, names are preferred by prefix in the order enx, wwan, usb, eth,
# anything else, with ties broken alphabetically.
# Returns: 0 with the chosen name on stdout; 1 (silently) when the subnet is
#          empty or malformed, `ip` output cannot be obtained or parsed, or no
#          interface matches.
blitz_detect_5g_interface_from_subnet() {
    local modem_subnet="${1:-${BLITZ_5G_MODEM_SUBNET:-}}"

    if [[ -z "${modem_subnet}" ]]; then
        return 1
    fi

    python3 - "${modem_subnet}" <<'PY'
import ipaddress
import json
import subprocess
import sys

# A malformed subnet is a configuration problem, not a crash: fail quietly
# with exit status 1 like every other error path in this script.
try:
    subnet = ipaddress.ip_network(sys.argv[1], strict=False)
except ValueError:
    raise SystemExit(1)

skip = {"lo", "docker0", "l4tbr0"}


def priority(name: str) -> tuple[int, str]:
    # Lower tuples sort first; the name itself breaks ties within a class.
    if name.startswith("enx"):
        return (0, name)
    if name.startswith("wwan"):
        return (1, name)
    if name.startswith("usb"):
        return (2, name)
    if name.startswith("eth"):
        return (3, name)
    return (9, name)


try:
    output = subprocess.check_output(["ip", "-j", "-4", "addr", "show"], text=True)
    payload = json.loads(output)
except Exception:
    raise SystemExit(1)

candidates = []
for item in payload:
    ifname = str(item.get("ifname") or "").strip()
    if not ifname or ifname in skip:
        continue
    for addr in item.get("addr_info") or []:
        if addr.get("family") != "inet":
            continue
        local = addr.get("local")
        prefixlen = addr.get("prefixlen")
        if not local or prefixlen is None:
            continue
        try:
            iface = ipaddress.ip_interface(f"{local}/{prefixlen}")
        except ValueError:
            continue
        if iface.ip in subnet:
            candidates.append((priority(ifname), ifname))
            # One matching address is enough for this interface.
            break

if not candidates:
    raise SystemExit(1)
candidates.sort(key=lambda item: item[0])
print(candidates[0][1])
PY
}
# Rewrite the 5G info JSON file from the interface's current addresses.
#
# Arguments:
#   $1 - interface name
#   $2 - output path (optional; defaults to BLITZ_5G_INFO_JSON)
# The file receives an object with "interface", "ipv4" and "ipv6" keys, where
# the address lists hold "address/prefixlen" strings taken from
# `ip -j addr show dev <interface>`. The parent directory is created when
# needed, and the data is written to a PID-suffixed temporary file that is
# then moved over the target with os.replace.
# Returns: 1 when either argument is empty; otherwise the exit status of the
#          embedded python3 script (1 when `ip` fails or reports nothing).
blitz_refresh_5g_info_json() {
    local interface_name="$1"
    local info_json="${2:-${BLITZ_5G_INFO_JSON:-}}"

    [[ -n "${interface_name}" && -n "${info_json}" ]] || return 1

    python3 - "${interface_name}" "${info_json}" <<'PY'
import json
import os
import subprocess
import sys

interface_name, path = sys.argv[1], sys.argv[2]

try:
    raw = subprocess.check_output(["ip", "-j", "addr", "show", "dev", interface_name], text=True)
    links = json.loads(raw)
except Exception:
    raise SystemExit(1)
if not links:
    raise SystemExit(1)

# Bucket the addresses by family; other families are ignored.
by_family = {"inet": [], "inet6": []}
for addr in links[0].get("addr_info") or []:
    local = addr.get("local")
    prefixlen = addr.get("prefixlen")
    if not local or prefixlen is None:
        continue
    bucket = by_family.get(addr.get("family"))
    if bucket is not None:
        bucket.append(f"{local}/{prefixlen}")

data = {
    "interface": interface_name,
    "ipv4": by_family["inet"],
    "ipv6": by_family["inet6"],
}

parent = os.path.dirname(path)
if parent:
    os.makedirs(parent, exist_ok=True)
temp_path = f"{path}.tmp.{os.getpid()}"
with open(temp_path, "w", encoding="utf-8") as handle:
    json.dump(data, handle, ensure_ascii=False, indent=2)
os.replace(temp_path, path)
PY
}
# Decide which interface is the 5G modem and print its name.
#
# Resolution order:
#   1. BLITZ_5G_INTERFACE, if set: used only if it exists, otherwise fail
#      (no fallback to the later steps).
#   2. The interface recorded in BLITZ_5G_INFO_JSON, if it still exists.
#   3. Detection by modem subnet; when the result differs from the recorded
#      name the info JSON is refreshed on a best-effort basis.
# Returns:
#   0 with the name on stdout, or 1 when nothing could be resolved.
blitz_resolve_5g_interface() {
  local forced="${BLITZ_5G_INTERFACE:-}"
  local info_json="${BLITZ_5G_INFO_JSON:-}"
  local remembered=""
  local detected=""

  if [[ -n "${forced}" ]]; then
    blitz_interface_exists "${forced}" || return 1
    printf '%s\n' "${forced}"
    return 0
  fi

  remembered="$(blitz_read_5g_info_interface "${info_json}" || true)"
  if [[ -n "${remembered}" ]] && blitz_interface_exists "${remembered}"; then
    printf '%s\n' "${remembered}"
    return 0
  fi

  detected="$(blitz_detect_5g_interface_from_subnet || true)"
  [[ -n "${detected}" ]] || return 1

  if [[ "${detected}" != "${remembered}" ]]; then
    # Failure to persist the new name must not fail the resolution itself.
    blitz_refresh_5g_info_json "${detected}" "${info_json}" >/dev/null 2>&1 || true
  fi
  printf '%s\n' "${detected}"
  return 0
}
# Create BLITZ_RUNTIME_DIR and make it group-writable (mode 0775).
#
# When running as root the directory is also handed to root:BLITZ_ROS_USER;
# for other users the chmod is best-effort. Logs a success line at the end.
blitz_prepare_runtime_dir() {
  local runtime_dir

  blitz_load_boot_env
  runtime_dir="${BLITZ_RUNTIME_DIR}"
  mkdir -p "${runtime_dir}"

  if (( EUID != 0 )); then
    chmod 0775 "${runtime_dir}" 2>/dev/null || true
  else
    chown "root:${BLITZ_ROS_USER}" "${runtime_dir}"
    chmod 0775 "${runtime_dir}"
  fi

  blitz_log "runtime-dir" "prepare" "success" "path=${runtime_dir}" 0
}
# Create the persistent run root with its "runs" and "incidents" directories.
#
# As root, ownership (recursively) and mode 0775 are applied on a best-effort
# basis; for other users only the directories are created.
blitz_prepare_run_root() {
  local run_root
  local run_dir
  local incidents_dir

  blitz_load_boot_env
  run_root="${BLITZ_RUN_ROOT}"
  run_dir="${run_root}/runs"
  incidents_dir="${run_root}/incidents"
  mkdir -p "${run_dir}" "${incidents_dir}"

  (( EUID == 0 )) || return 0

  chown -R "root:${BLITZ_ROS_USER}" "${run_root}" 2>/dev/null || true
  chmod 0775 "${run_root}" "${run_dir}" "${incidents_dir}" 2>/dev/null || true
}
# Source a run-context env file and export every variable it assigns.
#
# Arguments:
#   $1 - env file path (optional; defaults to BLITZ_RUN_CONTEXT_FILE).
# Returns:
#   0 after sourcing; 1 when no path is configured or the file is missing.
# Note: allexport is switched off afterwards regardless of its prior state.
blitz_load_run_context_env() {
  local context_file="${1:-${BLITZ_RUN_CONTEXT_FILE:-}}"

  [[ -n "${context_file}" && -f "${context_file}" ]] || return 1

  set -o allexport
  # shellcheck disable=SC1090
  . "${context_file}"
  set +o allexport
  return 0
}
# Print the run id stored in BLITZ_RUN_ID_FILE with CR/LF characters removed.
#
# Returns 1 when the variable is unset/empty or the file does not exist.
blitz_read_run_id() {
  local run_id_file="${BLITZ_RUN_ID_FILE:-}"

  [[ -n "${run_id_file}" && -f "${run_id_file}" ]] || return 1

  tr -d '\r\n' < "${run_id_file}"
}
# Print the current UTC time as a compact timestamp (YYYYmmddTHHMMSSZ).
blitz_utc_compact_timestamp() {
date -u '+%Y%m%dT%H%M%SZ'
}
# Print a new run id, which is just the current compact UTC timestamp.
# The timestamp has one-second resolution, so two ids generated within the
# same second are identical.
blitz_new_run_id() {
printf '%s\n' "$(blitz_utc_compact_timestamp)"
}
# Print a new incident id of the form <prefix>-<UTC timestamp>-<shell PID>.
#   $1 - id prefix (optional; defaults to "incident").
blitz_new_incident_id() {
local prefix="${1:-incident}"
printf '%s-%s-%d\n' "${prefix}" "$(blitz_utc_compact_timestamp)" "$$"
}
# Print a new instance id of the form <UTC timestamp>-<shell PID>.
blitz_new_instance_id() {
printf '%s-%d\n' "$(blitz_utc_compact_timestamp)" "$$"
}
# Print the HEAD commit hash of the checkout at OMNISOCKETGO_ROOT.
# Prints nothing and still returns 0 when git fails (errors are discarded).
blitz_git_commit() {
git -C "${OMNISOCKETGO_ROOT}" rev-parse HEAD 2>/dev/null || true
}
# Print "0" when the checkout at OMNISOCKETGO_ROOT matches HEAD, else "1".
#
# The comparison is against HEAD (not the index) so that staged-but-uncommitted
# changes also count as dirty. Untracked files are not considered. Any git
# failure (not a repository, git missing, ...) is reported as "1".
blitz_git_dirty_flag() {
  if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty HEAD >/dev/null 2>&1; then
    printf '0\n'
    return 0
  fi
  printf '1\n'
}
# Persist the identity of the current run.
#
# Arguments:
#   $1 - run id, $2 - run directory, $3 - boot id.
# Writes, in this order:
#   - BLITZ_RUN_ID_FILE (the run id on one line),
#   - BLITZ_RUN_CONTEXT_FILE (KEY=value env file, staged then renamed),
#   - <run_dir>/run-info.json (metadata plus every BLITZ_/OMNI_/ROBOT_RECEIVER_
#     environment variable, staged then renamed),
#   - the BLITZ_CURRENT_RUN_LINK symlink pointing at the run directory.
blitz_write_run_context() {
  local run_id="$1" run_dir="$2" boot_id="$3"
  local context_file="${BLITZ_RUN_CONTEXT_FILE}"
  local id_file="${BLITZ_RUN_ID_FILE}"
  local staged_context staged_info
  local commit_hash dirty_flag started_at

  commit_hash="$(blitz_git_commit)"
  dirty_flag="$(blitz_git_dirty_flag)"
  started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  staged_context="${context_file}.tmp.$$"
  staged_info="${run_dir}/run-info.json.tmp.$$"

  mkdir -p "${run_dir}"
  printf '%s\n' "${run_id}" > "${id_file}"

  cat > "${staged_context}" <<EOF
BLITZ_RUN_ID=${run_id}
BLITZ_RUN_DIR=${run_dir}
BLITZ_BOOT_ID=${boot_id}
BLITZ_RUN_ROOT=${BLITZ_RUN_ROOT}
EOF
  mv -f "${staged_context}" "${context_file}"

  python3 - "${staged_info}" "${run_id}" "${run_dir}" "${boot_id}" "${started_at}" "${commit_hash}" "${dirty_flag}" "${HOSTNAME:-$(hostname)}" <<'PY'
import json
import os
import sys

path, run_id, run_dir, boot_id, started_at, commit_hash, dirty_flag, hostname = sys.argv[1:9]

payload = {
    "run_id": run_id,
    "run_dir": run_dir,
    "boot_id": boot_id,
    "started_at": started_at,
    "hostname": hostname,
    "git_commit": commit_hash,
    "git_dirty": dirty_flag == "1",
    "env": {
        key: os.environ.get(key, "")
        for key in sorted(os.environ)
        if key.startswith(("BLITZ_", "OMNI_", "ROBOT_RECEIVER_"))
    },
}

with open(path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
PY
  mv -f "${staged_info}" "${run_dir}/run-info.json"

  ln -sfn "${run_dir}" "${BLITZ_CURRENT_RUN_LINK}"
}
# Start a new run: prepare directories, mint ids, export them and persist them.
#
# Exports BLITZ_RUN_ID, BLITZ_RUN_DIR and BLITZ_BOOT_ID. The boot id is read
# from /proc/sys/kernel/random/boot_id; if that read fails a fresh timestamp
# id is used instead.
blitz_init_run_context() {
  local run_id boot_id run_dir

  blitz_load_boot_env
  blitz_prepare_runtime_dir
  blitz_prepare_run_root

  run_id="$(blitz_new_run_id)"
  boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || blitz_new_run_id)"
  run_dir="${BLITZ_RUN_ROOT}/runs/${run_id}"

  export BLITZ_RUN_ID="${run_id}" BLITZ_RUN_DIR="${run_dir}" BLITZ_BOOT_ID="${boot_id}"

  blitz_write_run_context "${run_id}" "${run_dir}" "${boot_id}"
  blitz_log "run-context" "init" "success" "run_id=${run_id} run_dir=${run_dir}" 0
}
# Load the boot env and then the run-context env file.
# Logs a failure and returns 1 when the run-context file cannot be loaded.
blitz_require_run_context() {
  blitz_load_boot_env

  if ! blitz_load_run_context_env; then
    blitz_log "run-context" "load" "failure" "missing ${BLITZ_RUN_CONTEXT_FILE}" 1
    return 1
  fi
  return 0
}
# Export BLITZ_INSTANCE_ID, generating a fresh one only if it is unset or empty.
blitz_ensure_instance_id() {
  if [[ -z "${BLITZ_INSTANCE_ID:-}" ]]; then
    export BLITZ_INSTANCE_ID="$(blitz_new_instance_id)"
  fi
}
# Rotate a JSONL file once it has reached a size limit.
#
# Arguments:
#   $1 - path of the live file.
#   $2 - size threshold in bytes (optional; defaults to BLITZ_JSONL_ROTATE_BYTES).
#   $3 - number of rotated files to keep (optional; defaults to
#        BLITZ_JSONL_ROTATE_FILES).
# Behaviour:
#   path.N is dropped, path.K becomes path.K+1, and the live file becomes
#   path.1. Rotation is skipped (return 0) when the file is missing, is still
#   below the threshold, or either limit is zero or not a plain decimal number.
blitz_jsonl_rotate_if_needed() {
  local path="${1:-}"
  local max_bytes="${2:-${BLITZ_JSONL_ROTATE_BYTES:-0}}"
  local max_files="${3:-${BLITZ_JSONL_ROTATE_FILES:-0}}"
  local size=0
  local index

  if [[ -z "${path}" || ! -f "${path}" ]]; then
    return 0
  fi

  # Values such as "128M" or "abc" would otherwise be evaluated as arithmetic
  # expressions (syntax errors / unbound variables); treat them as "disabled".
  if [[ ! "${max_bytes}" =~ ^[0-9]+$ || ! "${max_files}" =~ ^[0-9]+$ ]]; then
    return 0
  fi
  # Force base 10 so a leading zero is not interpreted as octal.
  max_bytes=$(( 10#${max_bytes} ))
  max_files=$(( 10#${max_files} ))
  if (( max_bytes <= 0 || max_files <= 0 )); then
    return 0
  fi

  size="$(stat -c %s "${path}" 2>/dev/null || echo 0)"
  [[ "${size}" =~ ^[0-9]+$ ]] || size=0
  if (( size < max_bytes )); then
    return 0
  fi

  # Drop the oldest generation, then shift the rest up by one, oldest first,
  # so no generation overwrites another.
  rm -f "${path}.${max_files}"
  for (( index = max_files - 1; index >= 1; index-- )); do
    if [[ -f "${path}.${index}" ]]; then
      mv -f "${path}.${index}" "${path}.$(( index + 1 ))"
    fi
  done
  mv -f "${path}" "${path}.1"
}
# Append one line to a JSONL file.
#
# Arguments:
#   $1 - destination file, $2 - the line to append (a newline is added).
# The parent directory is created if needed and the file is offered for
# rotation before the line is written.
blitz_jsonl_append_line() {
  local path="$1" line="$2"
  local parent_dir

  parent_dir="$(dirname "${path}")"
  mkdir -p "${parent_dir}"
  blitz_jsonl_rotate_if_needed "${path}"
  printf '%s\n' "${line}" >> "${path}"
}
# Run blitz-incident-capture-launch.sh from BOOT_SCRIPT_DIR with the given
# arguments, discarding its output.
# Returns 0 if the script ran successfully, 1 if it is missing or failed.
blitz_launch_incident_capture() {
  local launch_script="${BOOT_SCRIPT_DIR}/blitz-incident-capture-launch.sh"

  [[ -f "${launch_script}" ]] || return 1

  if /bin/bash "${launch_script}" "$@" >/dev/null 2>&1; then
    return 0
  fi
  return 1
}

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed to the blitz_run / blitz_log / blitz_require_* helpers below.
STEP="disable"
# Directory that is checked for installed unit files.
SYSTEMD_DEST_DIR="/etc/systemd/system"
# Units handled by this script; they are stopped and then disabled in this order.
# NOTE(review): the watchdog is listed first, presumably so it cannot restart
# the other services while they are being stopped — confirm.
UNITS=(
"blitz-watchdog.service"
"blitz-5g-link-logger.service"
"blitz-b-side-omnid.service"
"blitz-ros-receiver.service"
"blitz-5g-dial.service"
"blitz-run-context.service"
"blitz-boot-gate.service"
"blitz-robot.target"
)
# Stop a unit, but only if its unit file is installed in SYSTEMD_DEST_DIR.
#   $1 - unit name.
# A failing `systemctl stop` is tolerated; the function still returns 0.
stop_unit_if_present() {
  local unit_name="$1"

  [[ -f "${SYSTEMD_DEST_DIR}/${unit_name}" ]] || return 0

  blitz_run "${STEP}" "stop-unit" systemctl stop "${unit_name}" || true
}
# Disable a unit, but only if its unit file is installed in SYSTEMD_DEST_DIR.
#   $1 - unit name.
# A failing `systemctl disable` is tolerated; the function still returns 0.
disable_unit_if_present() {
  local unit_name="$1"

  [[ -f "${SYSTEMD_DEST_DIR}/${unit_name}" ]] || return 0

  blitz_run "${STEP}" "disable-unit" systemctl disable "${unit_name}" || true
}
# Main flow: load the boot env, then run the root and systemctl prerequisite
# checks (NOTE(review): these helpers are defined in common.sh and presumably
# abort the script on failure — confirm).
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
# Pass 1: stop every installed unit before anything is disabled.
for unit_name in "${UNITS[@]}"; do
stop_unit_if_present "${unit_name}"
done
# Pass 2: disable every installed unit.
for unit_name in "${UNITS[@]}"; do
disable_unit_if_present "${unit_name}"
done
blitz_log "${STEP}" "complete" "success" "boot chain stopped and disabled; next reboot will not auto-start blitz services" 0

View File

@@ -30,6 +30,19 @@ install_unit() {
blitz_log "install" "install-unit" "success" "unit=${SYSTEMD_DEST_DIR}/${template_name%.in}" 0 blitz_log "install" "install-unit" "success" "unit=${SYSTEMD_DEST_DIR}/${template_name%.in}" 0
} }
# Remove an installed unit file from SYSTEMD_DEST_DIR.
#   $1 - unit name.
# Does nothing when the unit file is absent. Otherwise the unit is disabled
# and stopped (best-effort, output discarded), its file is deleted, and the
# removal is logged.
remove_unit_if_present() {
  local unit_name="$1"
  local unit_path="${SYSTEMD_DEST_DIR}/${unit_name}"

  [[ -f "${unit_path}" ]] || return 0

  systemctl disable --now "${unit_name}" >/dev/null 2>&1 || true
  rm -f "${unit_path}"
  blitz_log "install" "remove-unit" "success" "unit=${unit_path}" 0
}
blitz_load_boot_env blitz_load_boot_env
blitz_require_root "install" blitz_require_root "install"
blitz_require_command install "install" blitz_require_command install "install"
@@ -40,13 +53,18 @@ install -d -m 0755 "$(dirname "${BLITZ_LOG_FILE}")"
touch "${BLITZ_LOG_FILE}" touch "${BLITZ_LOG_FILE}"
chmod 0644 "${BLITZ_LOG_FILE}" chmod 0644 "${BLITZ_LOG_FILE}"
blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0 blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0
blitz_prepare_runtime_dir
blitz_prepare_run_root
install_unit "blitz-boot-gate.service.in" install_unit "blitz-boot-gate.service.in"
install_unit "blitz-run-context.service.in"
install_unit "blitz-5g-dial.service.in" install_unit "blitz-5g-dial.service.in"
install_unit "blitz-time-sync.service.in" install_unit "blitz-5g-link-logger.service.in"
install_unit "blitz-ros-receiver.service.in" install_unit "blitz-ros-receiver.service.in"
install_unit "blitz-b-side-omnid.service.in" install_unit "blitz-b-side-omnid.service.in"
install_unit "blitz-watchdog.service.in"
install_unit "blitz-robot.target.in" install_unit "blitz-robot.target.in"
remove_unit_if_present "blitz-time-sync.service"
blitz_run "install" "daemon-reload" systemctl daemon-reload blitz_run "install" "daemon-reload" systemctl daemon-reload
blitz_run "install" "enable-target" systemctl enable blitz-robot.target blitz_run "install" "enable-target" systemctl enable blitz-robot.target

View File

@@ -1,9 +1,9 @@
{ {
"interface": "enx78886c7fbd46", "interface": "enxb8f72c9e179a",
"ipv4": [ "ipv4": [
"192.168.225.62/22" "192.168.225.160/22"
], ],
"ipv6": [ "ipv6": [
"fe80::a335:b50d:622d:92e8/64" "fe80::52ae:a1c8:a9bb:a9a8/64"
] ]
} }

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label used in the completion log line.
STEP="runtime-dir"
# Load the boot configuration, create/permission the runtime directory, and
# record that the step finished.
blitz_load_boot_env
blitz_prepare_runtime_dir
blitz_log "${STEP}" "complete" "success" "runtime_dir=${BLITZ_RUNTIME_DIR}" 0

View File

@@ -2,7 +2,12 @@
# Override machine-specific values in robot-boot.env.local. # Override machine-specific values in robot-boot.env.local.
BLITZ_BOOT_DELAY_SEC="30" BLITZ_BOOT_DELAY_SEC="30"
BLITZ_RUN_ROOT="/var/log/blitz-robot"
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log" BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
BLITZ_RUNTIME_DIR="/run/blitz-robot"
BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUNTIME_DIR}/run-context.env"
BLITZ_RUN_ID_FILE="${BLITZ_RUNTIME_DIR}/run-id"
BLITZ_CURRENT_RUN_LINK="${BLITZ_RUN_ROOT}/current"
BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot" BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot"
BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2" BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2"
@@ -18,13 +23,38 @@ BLITZ_5G_ROUTE_WAIT_SEC="30"
# Leave empty to fall back to the host part of ROBOT_SIDE_OMNISOCKET_SERVER_ADDR. # Leave empty to fall back to the host part of ROBOT_SIDE_OMNISOCKET_SERVER_ADDR.
BLITZ_TIME_SERVER_IP="81.70.156.140" BLITZ_TIME_SERVER_IP="81.70.156.140"
BLITZ_TIME_SERVER_PORT="10910"
BLITZ_TIME_SYNC_WAIT_SEC="30"
BLITZ_TIME_SYNC_MAX_OFFSET_SEC="0.002"
BLITZ_TIME_SYNC_INTERVAL_SEC="1"
BLITZ_ROS_USER="nvidia" BLITZ_ROS_USER="nvidia"
BLITZ_ROS_SOCKET_WAIT_SEC="20" BLITZ_ROS_SOCKET_WAIT_SEC="20"
BLITZ_WATCHDOG_INTERVAL_SEC="5"
BLITZ_HEALTH_STALE_SEC="15"
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
BLITZ_KCP_STATS_INTERVAL_MS="1000"
BLITZ_CONTROL_LATENCY_LOG_ENABLED="1"
BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="100"
BLITZ_CONTROL_ACK_SAMPLE_MOD="10"
BLITZ_VIDEO_STAGE_LOG_ENABLED="1"
BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD="10"
BLITZ_5G_LINK_LOG_INTERVAL_SEC="5"
BLITZ_JSONL_FLUSH_INTERVAL_MS="1000"
BLITZ_JSONL_FLUSH_BYTES="262144"
BLITZ_JSONL_ROTATE_BYTES="134217728"
BLITZ_JSONL_ROTATE_FILES="8"
# Log one normal relay packet out of every N packets. Drop events still log immediately.
OMNI_RELAY_PACKET_LOG_SAMPLE_EVERY="200"
BLITZ_INCIDENT_COMMAND_TIMEOUT_SEC="5"
BLITZ_INCIDENT_TOTAL_TIMEOUT_SEC="30"
BLITZ_NETWORK_FAIL_THRESHOLD="3"
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
BLITZ_GPS_MONITOR_ENABLED="1"
BLITZ_GPS_DEVICE_GLOB="/dev/ttyCH341USB*"
BLITZ_GPS_CHECK_INTERVAL_SEC="10"
BLITZ_GPS_RESTART_UNITS="gpsd.socket gpsd.service"
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
OMNI_CAMERA_DEVICE="/dev/v4l/by-path/platform-a80aa10000.usb-usb-0:3.2:1.4-video-index0"
# Boot units run b_side_omnid as root directly, so nested sudo must stay off. # Boot units run b_side_omnid as root directly, so nested sudo must stay off.
B_SIDE_OMNID_USE_SUDO="0" B_SIDE_OMNID_USE_SUDO="0"
OMNI_CONTROL_ACK_PEER_ID="peer-b-ctrl-ack"
OMNI_CONTROL_ACK_TARGET_PEER="peer-a-ctrl-ack"

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label used in the start log line.
STEP="5g-link-logger-service"
blitz_load_boot_env
# The log path default below uses BLITZ_RUN_DIR, which the run-context env
# file is expected to provide.
blitz_require_run_context
export OMNI_BOOT_MODE="1"
# Keep an instance id supplied by the environment; otherwise mint a new one.
export BLITZ_INSTANCE_ID="${BLITZ_INSTANCE_ID:-$(blitz_new_instance_id)}"
# Default log file: one JSONL file per instance inside the current run directory.
export BLITZ_5G_LINK_LOG_PATH="${BLITZ_5G_LINK_LOG_PATH:-${BLITZ_RUN_DIR}/b-5g-link-quality.${BLITZ_INSTANCE_ID}.jsonl}"
blitz_log "${STEP}" "start" "start" "exec bash ${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh" 0
# Replace this wrapper process with the logger itself.
exec bash "${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh"

View File

@@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh"
STEP="b-side-omnid" STEP="b-side-omnid"
blitz_load_boot_env blitz_load_boot_env
blitz_require_run_context
blitz_require_executable "${OMNISOCKETGO_ROOT}/bin/b_side_omnid" "${STEP}" blitz_require_executable "${OMNISOCKETGO_ROOT}/bin/b_side_omnid" "${STEP}"

View File

@@ -8,6 +8,7 @@ source "${SCRIPT_DIR}/common.sh"
STEP="ros-receiver" STEP="ros-receiver"
blitz_load_boot_env blitz_load_boot_env
blitz_require_run_context
blitz_require_file "/opt/ros/${ROS_DISTRO}/setup.bash" "${STEP}" blitz_require_file "/opt/ros/${ROS_DISTRO}/setup.bash" "${STEP}"
blitz_require_file "${ROS_CONTROL_PY_DIR}/install/setup.bash" "${STEP}" blitz_require_file "${ROS_CONTROL_PY_DIR}/install/setup.bash" "${STEP}"

View File

@@ -1,7 +1,8 @@
[Unit] [Unit]
Description=Blitz robot 5G dial Description=Blitz robot 5G dial
After=blitz-boot-gate.service PartOf=blitz-robot.target
Requires=blitz-boot-gate.service After=blitz-run-context.service
Requires=blitz-run-context.service
[Service] [Service]
Type=oneshot Type=oneshot

View File

@@ -0,0 +1,19 @@
[Unit]
Description=Blitz robot 5G link logger
PartOf=blitz-robot.target
After=blitz-run-context.service blitz-5g-dial.service
Requires=blitz-run-context.service
Wants=blitz-run-context.service blitz-5g-dial.service
[Service]
Type=simple
EnvironmentFile=-/run/blitz-robot/run-context.env
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-5g-link-logger-service.sh
Restart=always
RestartSec=5
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -1,11 +1,16 @@
[Unit] [Unit]
Description=Blitz robot b-side omnid Description=Blitz robot b-side omnid
After=blitz-time-sync.service blitz-ros-receiver.service PartOf=blitz-robot.target
Wants=blitz-time-sync.service blitz-ros-receiver.service After=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service
Requires=blitz-run-context.service
Wants=blitz-run-context.service blitz-5g-dial.service blitz-ros-receiver.service
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=-/run/blitz-robot/run-context.env
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh
ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec /bin/bash "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason b-side-service-exit; fi'
Restart=always Restart=always
RestartSec=2 RestartSec=2
StandardOutput=append:@BLITZ_LOG_FILE@ StandardOutput=append:@BLITZ_LOG_FILE@

View File

@@ -1,5 +1,6 @@
[Unit] [Unit]
Description=Blitz robot boot gate Description=Blitz robot boot gate
PartOf=blitz-robot.target
After=multi-user.target network-online.target After=multi-user.target network-online.target
Wants=network-online.target Wants=network-online.target

View File

@@ -1,10 +1,12 @@
[Unit] [Unit]
Description=Blitz robot boot chain Description=Blitz robot boot chain
Wants=blitz-boot-gate.service Wants=blitz-boot-gate.service
Wants=blitz-run-context.service
Wants=blitz-5g-dial.service Wants=blitz-5g-dial.service
Wants=blitz-time-sync.service Wants=blitz-5g-link-logger.service
Wants=blitz-ros-receiver.service Wants=blitz-ros-receiver.service
Wants=blitz-b-side-omnid.service Wants=blitz-b-side-omnid.service
Wants=blitz-watchdog.service
After=multi-user.target After=multi-user.target
[Install] [Install]

View File

@@ -1,13 +1,19 @@
[Unit] [Unit]
Description=Blitz robot ROS receiver Description=Blitz robot ROS receiver
After=blitz-time-sync.service PartOf=blitz-robot.target
Wants=blitz-time-sync.service After=blitz-run-context.service blitz-5g-dial.service
Requires=blitz-run-context.service
Wants=blitz-run-context.service blitz-5g-dial.service
[Service] [Service]
Type=simple Type=simple
User=@BLITZ_ROS_USER@ User=@BLITZ_ROS_USER@
PermissionsStartOnly=true
EnvironmentFile=-/run/blitz-robot/run-context.env
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh
ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver
ExecStopPost=/bin/bash -lc 'if [[ "${SERVICE_RESULT:-success}" != "success" ]]; then exec /bin/bash "@OMNISOCKETGO_ROOT@/scripts/boot/blitz-incident-capture-launch.sh" --source exec-stop-post --unit "%n" --result "${SERVICE_RESULT:-}" --exit-status "${EXIT_STATUS:-}" --reason ros-service-exit; fi'
Restart=always Restart=always
RestartSec=2 RestartSec=2
StandardOutput=append:@BLITZ_LOG_FILE@ StandardOutput=append:@BLITZ_LOG_FILE@

View File

@@ -0,0 +1,15 @@
[Unit]
Description=Blitz robot run context
PartOf=blitz-robot.target
After=blitz-boot-gate.service
Requires=blitz-boot-gate.service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-run-context.sh
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -1,14 +0,0 @@
[Unit]
Description=Blitz robot private chrony sync
After=blitz-5g-dial.service
Wants=blitz-5g-dial.service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/time-sync.sh
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -0,0 +1,19 @@
[Unit]
Description=Blitz robot health watchdog
PartOf=blitz-robot.target
After=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service
Requires=blitz-run-context.service
Wants=blitz-run-context.service blitz-b-side-omnid.service blitz-ros-receiver.service
[Service]
Type=simple
EnvironmentFile=-/run/blitz-robot/run-context.env
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh
Restart=always
RestartSec=5
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -1,114 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed to the blitz_* logging/require helpers.
STEP="time-sync"
# Directory and file that hold the Blitz-managed chrony source definition.
CHRONY_SOURCES_DIR="/etc/chrony/sources.d"
CHRONY_SOURCE_FILE="${CHRONY_SOURCES_DIR}/blitz-robot.sources"
# Main chrony configuration and the one-time backup taken before rewriting it.
CHRONY_MAIN_CONF="/etc/chrony/chrony.conf"
CHRONY_MAIN_CONF_BAK="/etc/chrony/chrony.conf.blitz-bak"
# Argument passed to `chronyc burst`; can be overridden from the environment.
CHRONY_BURST_SAMPLES="${CHRONY_BURST_SAMPLES:-1/2}"
# Print the name of the chrony systemd unit.
#
# Tries chrony.service first, then chronyd.service, using
# `systemctl list-unit-files`; falls back to "chrony.service" when neither
# is listed.
chrony_unit_name() {
  local candidate

  for candidate in chrony.service chronyd.service; do
    if systemctl list-unit-files "${candidate}" --no-legend 2>/dev/null | grep -q "^${candidate%.service}\\.service"; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done

  printf '%s\n' "chrony.service"
}
# Make the main chrony configuration use only sources from CHRONY_SOURCES_DIR.
#
# Steps: require the main config to exist, back it up once, comment out every
# active `pool`/`server` line, ensure a `sourcedir /etc/chrony/sources.d`
# directive is present, and overwrite the main config only if the result differs.
ensure_chrony_main_conf() {
local temp_file
blitz_require_file "${CHRONY_MAIN_CONF}" "${STEP}"
mkdir -p "${CHRONY_SOURCES_DIR}"
# The backup is taken only the first time, so it keeps the pre-Blitz content.
if [[ ! -f "${CHRONY_MAIN_CONF_BAK}" ]]; then
cp -a "${CHRONY_MAIN_CONF}" "${CHRONY_MAIN_CONF_BAK}"
blitz_log "${STEP}" "backup-config" "success" "backup=${CHRONY_MAIN_CONF_BAK}" 0
fi
temp_file="$(mktemp)"
# awk pass: comment lines are copied unchanged; active pool/server lines are
# prefixed with "# blitz-managed-disabled "; everything else is copied as is.
awk '
/^[[:space:]]*#/ { print; next }
/^[[:space:]]*(pool|server)[[:space:]]+/ {
print "# blitz-managed-disabled " $0
next
}
{ print }
' "${CHRONY_MAIN_CONF}" > "${temp_file}"
# Append the sourcedir directive only when no active one is already present.
if ! grep -Eq '^[[:space:]]*sourcedir[[:space:]]+/etc/chrony/sources\.d([[:space:]]|$)' "${temp_file}"; then
printf '\n# blitz-managed\nsourcedir /etc/chrony/sources.d\n' >> "${temp_file}"
fi
# Only touch the real config when the rewritten content differs from it.
if ! cmp -s "${temp_file}" "${CHRONY_MAIN_CONF}"; then
cp "${temp_file}" "${CHRONY_MAIN_CONF}"
blitz_log "${STEP}" "rewrite-main-config" "success" "commented non-Blitz pool/server entries in ${CHRONY_MAIN_CONF}" 0
else
blitz_log "${STEP}" "rewrite-main-config" "success" "main config already matches Blitz expectations" 0
fi
rm -f "${temp_file}"
}
# Write CHRONY_SOURCE_FILE so it contains a single `server` line for
# BLITZ_TIME_SERVER_IP / BLITZ_TIME_SERVER_PORT (with iburst).
# The file is only rewritten when it is missing or its content differs.
write_chrony_source_file() {
local temp_file
temp_file="$(mktemp)"
# Render the desired content into a temp file first so it can be compared.
cat <<EOF > "${temp_file}"
# blitz-managed
server ${BLITZ_TIME_SERVER_IP} port ${BLITZ_TIME_SERVER_PORT} iburst
EOF
if [[ ! -f "${CHRONY_SOURCE_FILE}" ]] || ! cmp -s "${temp_file}" "${CHRONY_SOURCE_FILE}"; then
cp "${temp_file}" "${CHRONY_SOURCE_FILE}"
blitz_log "${STEP}" "write-source" "success" "source_file=${CHRONY_SOURCE_FILE} server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT}" 0
else
blitz_log "${STEP}" "write-source" "success" "source_file already matches ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0
fi
rm -f "${temp_file}"
}
# Main flow: load configuration and run the root / systemctl / chronyc
# prerequisite checks.
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command chronyc "${STEP}"
# Validate the configured time server before touching any chrony files.
if [[ -z "${BLITZ_TIME_SERVER_IP}" ]]; then
blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_IP is empty and no fallback could be derived" 1
exit 1
fi
if ! [[ "${BLITZ_TIME_SERVER_PORT}" =~ ^[0-9]+$ ]] || (( BLITZ_TIME_SERVER_PORT < 1 || BLITZ_TIME_SERVER_PORT > 65535 )); then
blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_PORT must be an integer between 1 and 65535" 1
exit 1
fi
# Rewrite the chrony configuration, then restart chrony so it takes effect.
ensure_chrony_main_conf
write_chrony_source_file
CHRONY_UNIT="$(chrony_unit_name)"
blitz_run "${STEP}" "restart-chrony" systemctl restart "${CHRONY_UNIT}"
blitz_run "${STEP}" "burst" chronyc burst "${CHRONY_BURST_SAMPLES}"
blitz_log "${STEP}" "waitsync" "start" "server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT} wait_sec=${BLITZ_TIME_SYNC_WAIT_SEC} max_offset_sec=${BLITZ_TIME_SYNC_MAX_OFFSET_SEC} interval_sec=${BLITZ_TIME_SYNC_INTERVAL_SEC}" 0
# A failed waitsync is logged as a soft failure; the script keeps going.
if chronyc waitsync "${BLITZ_TIME_SYNC_WAIT_SEC}" "${BLITZ_TIME_SYNC_MAX_OFFSET_SEC}" 1000 "${BLITZ_TIME_SYNC_INTERVAL_SEC}"; then
blitz_log "${STEP}" "waitsync" "success" "chrony synchronized to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0
else
rc=$?
blitz_log "${STEP}" "waitsync" "soft_fail" "chrony did not synchronize to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT} within the configured timeout" "${rc}"
fi
# Diagnostics only: failures of these two commands are ignored.
blitz_log "${STEP}" "tracking" "start" "chronyc tracking" 0
chronyc tracking || true
blitz_log "${STEP}" "sources" "start" "chronyc sources -v" 0
chronyc sources -v || true
blitz_log "${STEP}" "complete" "success" "time-sync step finished" 0

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
from datetime import datetime, timezone
import html
import json
from pathlib import Path
from typing import Any
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Returns:
        Namespace with ``run_dir`` (required) and ``output_dir`` (``None``
        when ``--output-dir`` is not given).
    """
    cli = argparse.ArgumentParser(
        description="Aggregate run logs into control/video latency estimate outputs.",
    )
    cli.add_argument("--run-dir", required=True, help="Run directory containing JSONL logs.")
    cli.add_argument("--output-dir", help="Output directory. Defaults to --run-dir.")
    namespace = cli.parse_args()
    return namespace
def iter_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load every JSON-object line from a JSONL file.

    Args:
        path: File to read. A missing file yields an empty list.

    Returns:
        The decoded objects in file order. Blank lines, lines that are not
        valid JSON, and JSON values that are not objects are skipped.
    """
    records: list[dict[str, Any]] = []
    if not path.exists():
        return records
    # errors="replace": a log cut off mid-write (or otherwise corrupted) may
    # contain invalid UTF-8. Such bytes should cost at most the affected line,
    # in the same way malformed JSON lines are skipped, not abort the whole run.
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            try:
                payload = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(payload, dict):
                records.append(payload)
    return records
def load_glob_jsonl(run_dir: Path, pattern: str) -> list[dict[str, Any]]:
    """Concatenate the records of every JSONL file in ``run_dir`` matching ``pattern``.

    Files are visited in sorted path order; records keep their in-file order.
    """
    return [
        record
        for jsonl_path in sorted(run_dir.glob(pattern))
        for record in iter_jsonl(jsonl_path)
    ]
def write_jsonl(path: Path, records: list[dict[str, Any]]) -> None:
    """Write ``records`` to ``path`` as compact JSON, one object per line.

    The file is truncated first; non-ASCII characters are written unescaped.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False, separators=(",", ":")) + "\n"
            for entry in records
        )
def parse_unix_ms(value: Any) -> int | None:
    """Convert a timestamp value to integer Unix milliseconds.

    Args:
        value: ``None``, a number (returned truncated to ``int``, i.e. it is
            taken to already be in milliseconds), or an ISO-8601 string. A
            trailing ``Z`` is accepted as UTC.

    Returns:
        Milliseconds since the Unix epoch, or ``None`` when the value is
        missing, blank, not finite, or not parseable.
    """
    import re

    if value is None:
        return None
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be converted; treat as "no timestamp".
            return None
    text = str(value).strip()
    if not text:
        return None
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    # datetime.fromisoformat() before Python 3.11 only accepts exactly 3 or 6
    # fractional-second digits. Normalize any other count (e.g. ".5" or
    # nanosecond precision) to 6 digits so such timestamps are not dropped.
    text = re.sub(
        r"(:\d{2})\.(\d+)",
        lambda match: f"{match.group(1)}.{match.group(2)[:6].ljust(6, '0')}",
        text,
        count=1,
    )
    try:
        return int(datetime.fromisoformat(text).astimezone(timezone.utc).timestamp() * 1000)
    except ValueError:
        return None
def flatten_net_epoch(samples: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten nested network samples into one flat row per sample.

    For each sample the KCP and trend statistics found under
    ``links.<link>.sessions.<channel>`` (links ``a_to_d`` / ``d_to_b``,
    channels ``control`` / ``video``) are copied to ``<link>_<channel>_*``
    columns. Anything missing along the path yields ``None`` for that column.
    """
    link_names = ("a_to_d", "d_to_b")
    channel_names = ("control", "video")
    # Channel-major, link-minor: this is the column order of the output rows.
    pairs = [(link, channel) for channel in channel_names for link in link_names]

    rows: list[dict[str, Any]] = []
    for sample in samples:
        links = sample.get("links") or {}
        kcp: dict[tuple[str, str], Any] = {}
        trend: dict[tuple[str, str], Any] = {}
        for link in link_names:
            sessions = (links.get(link) or {}).get("sessions") or {}
            for channel in channel_names:
                session = sessions.get(channel) or {}
                kcp[link, channel] = session.get("kcp") or {}
                trend[link, channel] = session.get("trend") or {}

        row: dict[str, Any] = {"updated_at": sample.get("updated_at")}
        for link, channel in pairs:
            row[f"{link}_{channel}_srtt_ms"] = kcp[link, channel].get("srtt_ms")
            row[f"{link}_{channel}_min_srtt_ms"] = kcp[link, channel].get("min_srtt_ms")
        for link, channel in pairs:
            row[f"{link}_{channel}_feedback_age_ms"] = kcp[link, channel].get("last_feedback_age_ms")
        for link, channel in pairs:
            row[f"{link}_{channel}_retrans_delta"] = trend[link, channel].get("retrans_delta")
        for link in link_names:
            row[f"{link}_video_window_pressure_pct"] = kcp[link, "video"].get("window_pressure_pct")
        row["robot_health"] = sample.get("robot_health")
        rows.append(row)
    return rows
def aggregate_control_estimates(
    network_samples: list[dict[str, Any]],
    control_events: list[dict[str, Any]],
    control_acks: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Return control-latency estimates for a run.

    When any control ACK records exist they are returned unchanged. Otherwise
    one ``srtt_fallback`` row is built per network sample from its
    ``latency_estimate`` section, each tagged with the number of control events.
    """
    if control_acks:
        return control_acks

    copied_fields = (
        "control_loop_rtt_ms",
        "control_to_persist_est_ms",
        "control_oneway_srtt_est_ms",
        "control_oneway_bestcase_est_ms",
    )
    event_count = len(control_events)

    def fallback_row(sample: dict[str, Any]) -> dict[str, Any]:
        estimate = sample.get("latency_estimate") or {}
        row: dict[str, Any] = {
            "updated_at": sample.get("updated_at"),
            "estimate_method": "srtt_fallback",
        }
        for field in copied_fields:
            row[field] = estimate.get(field)
        row["source_event_count"] = event_count
        return row

    return [fallback_row(sample) for sample in network_samples]
def aggregate_video_estimates(
    network_samples: list[dict[str, Any]],
    frame_recv_records: list[dict[str, Any]],
    display_probe_records: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Build one video latency estimate per received frame.

    Args:
        network_samples: Samples with ``updated_at`` and ``latency_estimate``;
            samples whose timestamp cannot be parsed are ignored.
        frame_recv_records: Frame records; those without ``frame_seq`` are skipped.
        display_probe_records: Display probes, matched to frames by ``frame_seq``
            (when several probes share a sequence number the last one wins).

    Returns:
        One dict per frame combining the frame's ``b_side_capture_to_send_ms``,
        the ``video_network_oneway_est_ms`` of the latest network sample at or
        before the frame's ``backend_received_unix_ns``, and the probe's
        request-to-paint time when available.
    """
    from bisect import bisect_right

    network_timeline = sorted(
        (
            (updated_at_ms, sample.get("latency_estimate") or {})
            for sample in network_samples
            for updated_at_ms in [parse_unix_ms(sample.get("updated_at"))]
            if updated_at_ms is not None
        ),
        key=lambda item: item[0],
    )
    timeline_ms = [item[0] for item in network_timeline]
    probes_by_seq = {
        int(record["frame_seq"]): record
        for record in display_probe_records
        if record.get("frame_seq") is not None
    }

    estimates: list[dict[str, Any]] = []
    for record in frame_recv_records:
        frame_seq = record.get("frame_seq")
        if frame_seq is None:
            continue
        probe = probes_by_seq.get(int(frame_seq))

        backend_received_unix_ns = record.get("backend_received_unix_ns")
        backend_received_unix_ms = None
        try:
            if backend_received_unix_ns is not None:
                # Integer division keeps nanosecond timestamps exact.
                backend_received_unix_ms = int(backend_received_unix_ns) // 1_000_000
        except (TypeError, ValueError):
            backend_received_unix_ms = None

        # Binary search per frame instead of a forward-only cursor: frames are
        # not guaranteed to arrive in timestamp order (records may come from
        # several files), and a cursor that has already advanced would give
        # earlier frames no estimate at all.
        latency_estimate: dict[str, Any] = {}
        if backend_received_unix_ms is not None:
            position = bisect_right(timeline_ms, backend_received_unix_ms) - 1
            if position >= 0:
                latency_estimate = network_timeline[position][1]

        network_oneway = latency_estimate.get("video_network_oneway_est_ms")
        capture_to_send = record.get("b_side_capture_to_send_ms")
        partial_est = None
        if capture_to_send is not None or network_oneway is not None:
            # A missing component counts as 0 so a partial estimate is still produced.
            partial_est = round(float(capture_to_send or 0.0) + float(network_oneway or 0.0), 3)

        request_to_paint_ms = None
        if probe is not None and probe.get("request_to_paint_ms") is not None:
            request_to_paint_ms = round(float(probe["request_to_paint_ms"]), 3)
        elif (
            probe is not None
            and probe.get("request_started_unix_ms") is not None
            and probe.get("paint_unix_ms") is not None
        ):
            # Fall back to deriving the duration from the two probe timestamps.
            request_to_paint_ms = round(
                float(probe["paint_unix_ms"]) - float(probe["request_started_unix_ms"]), 3
            )

        video_e2e_est_ms = (
            round(partial_est + request_to_paint_ms, 3)
            if partial_est is not None and request_to_paint_ms is not None
            else None
        )

        estimates.append(
            {
                "frame_seq": frame_seq,
                "backend_received_unix_ns": record.get("backend_received_unix_ns"),
                "frame_hash": record.get("frame_hash"),
                "estimate_method": "capture_to_send+srtt/2+request_to_paint" if video_e2e_est_ms is not None else "capture_to_send+srtt/2",
                "video_network_oneway_est_ms": network_oneway,
                "b_side_capture_to_send_ms": capture_to_send,
                "request_to_paint_ms": request_to_paint_ms,
                "response_to_paint_ms": probe.get("response_to_paint_ms") if probe is not None else None,
                "backend_to_request_ms": probe.get("backend_to_request_ms") if probe is not None else None,
                "backend_to_request_ms_raw": probe.get("backend_to_request_ms_raw") if probe is not None else None,
                "backend_to_paint_ms": probe.get("backend_to_paint_ms") if probe is not None else None,
                "backend_to_paint_ms_raw": probe.get("backend_to_paint_ms_raw") if probe is not None else None,
                "browser_backend_clock_offset_ms": probe.get("browser_backend_clock_offset_ms") if probe is not None else None,
                "browser_backend_clock_rtt_ms": probe.get("browser_backend_clock_rtt_ms") if probe is not None else None,
                "video_partial_est_ms": partial_est,
                "video_e2e_est_ms": video_e2e_est_ms,
                "sequence_gap": record.get("sequence_gap"),
                "repeat_flag": record.get("repeat_flag"),
                "sender_clock_delta_ms_raw": record.get("sender_clock_delta_ms_raw"),
            }
        )
    return estimates
def write_html_summary(
    path: Path,
    *,
    net_epochs: list[dict[str, Any]],
    control_estimates: list[dict[str, Any]],
    video_estimates: list[dict[str, Any]],
) -> None:
    """Write a static HTML page showing the newest latency figures.

    Only the last record of each list is rendered, together with the number
    of control and video samples. Every value is passed through ``str`` and
    ``html.escape``; a missing key (or an empty list) therefore renders as
    the text ``None``.

    Args:
        path: Destination file; it is overwritten as UTF-8 text.
        net_epochs: Flattened network epoch records.
        control_estimates: Control latency estimate records.
        video_estimates: Video latency estimate records.
    """

    def newest(rows: list[dict[str, Any]]) -> dict[str, Any]:
        # An empty list falls back to an empty record so lookups yield None.
        return rows[-1] if rows else {}

    def cell(row: dict[str, Any], key: str) -> str:
        return html.escape(str(row.get(key)))

    control = newest(control_estimates)
    video = newest(video_estimates)
    net = newest(net_epochs)
    control_count = len(control_estimates)
    video_count = len(video_estimates)

    document = f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Latency Estimates</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 24px; background: #0b1020; color: #eef2ff; }}
.grid {{ display: grid; grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 16px; }}
.card {{ border: 1px solid #334155; border-radius: 8px; padding: 16px; background: #111827; }}
h1, h2 {{ margin-top: 0; }}
p {{ margin: 6px 0; line-height: 1.5; }}
code {{ color: #93c5fd; }}
</style>
</head>
<body>
<h1>Latency Estimates</h1>
<div class="grid">
<section class="card">
<h2>Control</h2>
<p><strong>loop RTT:</strong> {cell(control, "control_loop_rtt_ms")}</p>
<p><strong>to persist:</strong> {cell(control, "control_to_persist_est_ms")}</p>
<p><strong>method:</strong> {cell(control, "estimate_method")}</p>
<p><strong>samples:</strong> {control_count}</p>
</section>
<section class="card">
<h2>Video</h2>
<p><strong>network one-way:</strong> {cell(video, "video_network_oneway_est_ms")}</p>
<p><strong>partial:</strong> {cell(video, "video_partial_est_ms")}</p>
<p><strong>end-to-end:</strong> {cell(video, "video_e2e_est_ms")}</p>
<p><strong>samples:</strong> {video_count}</p>
</section>
<section class="card">
<h2>Net Epoch</h2>
<p><strong>a→d control srtt:</strong> {cell(net, "a_to_d_control_srtt_ms")}</p>
<p><strong>d→b control srtt:</strong> {cell(net, "d_to_b_control_srtt_ms")}</p>
<p><strong>a→d video srtt:</strong> {cell(net, "a_to_d_video_srtt_ms")}</p>
<p><strong>d→b video srtt:</strong> {cell(net, "d_to_b_video_srtt_ms")}</p>
</section>
</div>
</body>
</html>
"""
    path.write_text(document, encoding="utf-8")
def main() -> int:
    """Build the latency estimate reports for one run directory.

    Loads the A-side JSONL logs from ``args.run_dir``, derives the network
    epoch, control and video estimates, and writes three JSONL files plus an
    HTML summary into ``args.output_dir`` (the run directory itself when no
    output directory was given).

    Returns:
        Always 0.
    """
    args = parse_args()
    run_dir = Path(args.run_dir).resolve()
    output_dir = run_dir
    if args.output_dir:
        output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Inputs: one glob per log family; instance ids sit between stem and suffix.
    network_samples = load_glob_jsonl(run_dir, "a-network-summary.*.jsonl")
    control_events = load_glob_jsonl(run_dir, "a-control-events.*.jsonl")
    control_acks = load_glob_jsonl(run_dir, "a-control-acks.*.jsonl")
    frame_recv_records = load_glob_jsonl(run_dir, "a-video-frame-recv.*.jsonl")
    display_probe_records = load_glob_jsonl(run_dir, "a-video-display-probe.*.jsonl")

    net_epochs = flatten_net_epoch(network_samples)
    control_estimates = aggregate_control_estimates(network_samples, control_events, control_acks)
    video_estimates = aggregate_video_estimates(network_samples, frame_recv_records, display_probe_records)

    jsonl_outputs = (
        ("net-epoch-summary.jsonl", net_epochs),
        ("control-latency-estimates.jsonl", control_estimates),
        ("video-latency-estimates.jsonl", video_estimates),
    )
    for file_name, rows in jsonl_outputs:
        write_jsonl(output_dir / file_name, rows)

    write_html_summary(
        output_dir / "latency-estimates.html",
        net_epochs=net_epochs,
        control_estimates=control_estimates,
        video_estimates=video_estimates,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -1,14 +1,31 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" LOAD_ENV_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_OMNISOCKETGO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" DEFAULT_OMNISOCKETGO_ROOT="$(cd "${LOAD_ENV_SCRIPT_DIR}/../.." && pwd)"
die() { die() {
echo "$*" >&2 echo "$*" >&2
return 1 2>/dev/null || exit 1 return 1 2>/dev/null || exit 1
} }
# Strip one trailing carriage return from every shell variable whose name
# matches the project prefixes below, then export it. Variables without a
# trailing CR are left untouched (and are not newly exported).
normalize_loaded_env_vars() {
  local name
  local current

  while IFS= read -r name; do
    case "${name}" in
      BACKEND_*|BLITZ_*|B_SIDE_*|CONTROL_*|FRONTEND_*|OMNI_*|PYTHON3_BIN|PYTHON_VENV_PATH|ROBOT_*|ROS_DISTRO|VITE_*)
        ;;
      *)
        continue
        ;;
    esac

    current="${!name}"
    if [[ "${current}" != *$'\r' ]]; then
      continue
    fi

    # printf -v assigns through the variable name held in ${name}.
    printf -v "${name}" '%s' "${current%$'\r'}"
    export "${name}"
  done < <(compgen -A variable)
}
is_omnisocketgo_root() { is_omnisocketgo_root() {
local dir="$1" local dir="$1"
[[ -f "${dir}/Makefile" && -f "${dir}/cmd/b_side_omnid.c" && -d "${dir}/ros-control-py" ]] [[ -f "${dir}/Makefile" && -f "${dir}/cmd/b_side_omnid.c" && -d "${dir}/ros-control-py" ]]
@@ -21,7 +38,7 @@ is_robot_command_center_root() {
require_robot_command_center_root() { require_robot_command_center_root() {
if ! is_robot_command_center_root "${ROBOT_COMMAND_CENTER_ROOT}"; then if ! is_robot_command_center_root "${ROBOT_COMMAND_CENTER_ROOT}"; then
die "ROBOT_COMMAND_CENTER_ROOT must point to the robot-command-center repo root. Current value: ${ROBOT_COMMAND_CENTER_ROOT}. Set it in ${SCRIPT_DIR}/robot-remote.env.local if needed." die "ROBOT_COMMAND_CENTER_ROOT must point to the robot-command-center repo root. Current value: ${ROBOT_COMMAND_CENTER_ROOT}. Set it in ${LOAD_ENV_SCRIPT_DIR}/robot-remote.env.local if needed."
fi fi
} }
@@ -55,8 +72,8 @@ if [[ "${OMNI_CAMERA_VERIFY+x}" == "x" ]]; then
fi fi
ENV_FILES=( ENV_FILES=(
"${SCRIPT_DIR}/robot-remote.env" "${LOAD_ENV_SCRIPT_DIR}/robot-remote.env"
"${SCRIPT_DIR}/robot-remote.env.local" "${LOAD_ENV_SCRIPT_DIR}/robot-remote.env.local"
) )
for env_file in "${ENV_FILES[@]}"; do for env_file in "${ENV_FILES[@]}"; do
@@ -68,6 +85,8 @@ for env_file in "${ENV_FILES[@]}"; do
fi fi
done done
normalize_loaded_env_vars
if [[ "${omni_camera_device_was_set}" == "1" ]]; then if [[ "${omni_camera_device_was_set}" == "1" ]]; then
export OMNI_CAMERA_DEVICE="${preserved_omni_camera_device}" export OMNI_CAMERA_DEVICE="${preserved_omni_camera_device}"
fi fi
@@ -105,7 +124,7 @@ export OMNISOCKET_TELEMETRY_INTERVAL_MS="${OMNISOCKET_TELEMETRY_INTERVAL_MS:-100
export OMNISOCKET_TELEMETRY_STALE_AFTER_MS="${OMNISOCKET_TELEMETRY_STALE_AFTER_MS:-3000}" export OMNISOCKET_TELEMETRY_STALE_AFTER_MS="${OMNISOCKET_TELEMETRY_STALE_AFTER_MS:-3000}"
export OMNI_NETWORK_SUMMARY_LOG_ENABLED="${OMNI_NETWORK_SUMMARY_LOG_ENABLED:-1}" export OMNI_NETWORK_SUMMARY_LOG_ENABLED="${OMNI_NETWORK_SUMMARY_LOG_ENABLED:-1}"
export OMNI_NETWORK_SUMMARY_LOG_PATH="${OMNI_NETWORK_SUMMARY_LOG_PATH:-${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl}" export OMNI_NETWORK_SUMMARY_LOG_PATH="${OMNI_NETWORK_SUMMARY_LOG_PATH:-${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl}"
export OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS="${OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS:-2000}" export OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS="${OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS:-1000}"
export OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC="${OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC:-3}" export OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC="${OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC:-3}"
export CONTROL_SIDE_OMNISOCKET_SERVER_ADDR="${CONTROL_SIDE_OMNISOCKET_SERVER_ADDR:-}" export CONTROL_SIDE_OMNISOCKET_SERVER_ADDR="${CONTROL_SIDE_OMNISOCKET_SERVER_ADDR:-}"
export CONTROL_SIDE_OMNISOCKET_RELAY_VIA="${CONTROL_SIDE_OMNISOCKET_RELAY_VIA:-}" export CONTROL_SIDE_OMNISOCKET_RELAY_VIA="${CONTROL_SIDE_OMNISOCKET_RELAY_VIA:-}"
@@ -127,9 +146,179 @@ export OMNI_CAMERA_PROFILE="${OMNI_CAMERA_PROFILE:-night}"
export OMNI_CAMERA_BRIGHTNESS="${OMNI_CAMERA_BRIGHTNESS:-}" export OMNI_CAMERA_BRIGHTNESS="${OMNI_CAMERA_BRIGHTNESS:-}"
export OMNI_CAMERA_CUSTOM_CTRL="${OMNI_CAMERA_CUSTOM_CTRL:-}" export OMNI_CAMERA_CUSTOM_CTRL="${OMNI_CAMERA_CUSTOM_CTRL:-}"
export OMNI_CAMERA_VERIFY="${OMNI_CAMERA_VERIFY:-0}" export OMNI_CAMERA_VERIFY="${OMNI_CAMERA_VERIFY:-0}"
export OMNI_GPSD_HOST="${OMNI_GPSD_HOST:-127.0.0.1}"
export OMNI_VIDEO_SERVER_ADDR="${OMNI_VIDEO_SERVER_ADDR:-${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}}" export OMNI_VIDEO_SERVER_ADDR="${OMNI_VIDEO_SERVER_ADDR:-${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}}"
export OMNI_VIDEO_RELAY_VIA="${OMNI_VIDEO_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}" export OMNI_VIDEO_RELAY_VIA="${OMNI_VIDEO_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}"
export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR:-${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}}" export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR:-${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR:-}}"
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}" export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA:-${ROBOT_SIDE_OMNISOCKET_RELAY_VIA:-}}"
export OMNI_CONTROL_UNIX_SOCKET_PATH="${OMNI_CONTROL_UNIX_SOCKET_PATH:-${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}}" export OMNI_CONTROL_UNIX_SOCKET_PATH="${OMNI_CONTROL_UNIX_SOCKET_PATH:-${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}}"
export OMNI_CONTROL_ACK_PEER_ID="${OMNI_CONTROL_ACK_PEER_ID:-peer-b-ctrl-ack}"
export OMNI_CONTROL_ACK_TARGET_PEER="${OMNI_CONTROL_ACK_TARGET_PEER:-peer-a-ctrl-ack}"
export B_SIDE_OMNID_USE_SUDO="${B_SIDE_OMNID_USE_SUDO:-1}" export B_SIDE_OMNID_USE_SUDO="${B_SIDE_OMNID_USE_SUDO:-1}"
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-${OMNISOCKETGO_ROOT}/logs/runtime}"
export BLITZ_RUN_ROOT="${BLITZ_RUN_ROOT:-${OMNISOCKETGO_ROOT}/logs}"
export BLITZ_RUN_CONTEXT_FILE="${BLITZ_RUN_CONTEXT_FILE:-${BLITZ_RUNTIME_DIR}/run-context.env}"
export BLITZ_RUN_ID_FILE="${BLITZ_RUN_ID_FILE:-${BLITZ_RUNTIME_DIR}/run-id}"
export BLITZ_CURRENT_RUN_LINK="${BLITZ_CURRENT_RUN_LINK:-${BLITZ_RUN_ROOT}/current}"
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
export BLITZ_5G_MODEM_SUBNET="${BLITZ_5G_MODEM_SUBNET:-192.168.224.0/22}"
export BLITZ_5G_GATEWAY="${BLITZ_5G_GATEWAY:-192.168.225.1}"
export BLITZ_5G_ROUTE_TARGETS="${BLITZ_5G_ROUTE_TARGETS:-106.55.173.235}"
export BLITZ_5G_INFO_JSON="${BLITZ_5G_INFO_JSON:-${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json}"
export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-}"
export BLITZ_KCP_STATS_INTERVAL_MS="${BLITZ_KCP_STATS_INTERVAL_MS:-1000}"
export BLITZ_CONTROL_LATENCY_LOG_ENABLED="${BLITZ_CONTROL_LATENCY_LOG_ENABLED:-1}"
export BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD="${BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD:-100}"
export BLITZ_CONTROL_ACK_SAMPLE_MOD="${BLITZ_CONTROL_ACK_SAMPLE_MOD:-10}"
export BLITZ_VIDEO_STAGE_LOG_ENABLED="${BLITZ_VIDEO_STAGE_LOG_ENABLED:-1}"
export BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD="${BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD:-10}"
export BLITZ_5G_LINK_LOG_INTERVAL_SEC="${BLITZ_5G_LINK_LOG_INTERVAL_SEC:-5}"
export BLITZ_JSONL_FLUSH_INTERVAL_MS="${BLITZ_JSONL_FLUSH_INTERVAL_MS:-1000}"
export BLITZ_JSONL_FLUSH_BYTES="${BLITZ_JSONL_FLUSH_BYTES:-262144}"
export BLITZ_JSONL_ROTATE_BYTES="${BLITZ_JSONL_ROTATE_BYTES:-134217728}"
export BLITZ_JSONL_ROTATE_FILES="${BLITZ_JSONL_ROTATE_FILES:-8}"
# Print the current UTC time in compact form, e.g. 20260418T090113Z.
blitz_dev_utc_compact_timestamp() {
  local -r stamp_format='+%Y%m%dT%H%M%SZ'
  date -u "${stamp_format}"
}
# Print the commit hash checked out in ${OMNISOCKETGO_ROOT}, or nothing when
# it cannot be resolved (not a git checkout, git missing, no commits yet).
# Always returns 0 so callers running under `set -e` are not aborted.
#
# --verify makes rev-parse emit output only when HEAD resolves to an object;
# without it, a repository with no commits echoes the literal word "HEAD" on
# stdout, which would then be recorded as the commit hash.
blitz_dev_git_commit() {
  git -C "${OMNISOCKETGO_ROOT}" rev-parse --verify --quiet HEAD 2>/dev/null || true
}
# Print "0" when the tracked files in ${OMNISOCKETGO_ROOT} match HEAD and "1"
# otherwise. Any git failure (not a checkout, git missing, no commits yet)
# is also reported as "1".
#
# The comparison is made against HEAD rather than the index: a plain
# `git diff` only compares the working tree with the index, so changes that
# were already staged with `git add` would be reported as clean.
# Untracked files are not considered.
blitz_dev_git_dirty_flag() {
  if git -C "${OMNISOCKETGO_ROOT}" diff --quiet --ignore-submodules=dirty HEAD >/dev/null 2>&1; then
    printf '0\n'
    return 0
  fi
  printf '1\n'
}
# Create the runtime directory and the runs/ and incidents/ directories under
# the run root (no error if they already exist).
blitz_dev_prepare_dirs() {
  local -a required_dirs=(
    "${BLITZ_RUNTIME_DIR}"
    "${BLITZ_RUN_ROOT}/runs"
    "${BLITZ_RUN_ROOT}/incidents"
  )
  mkdir -p "${required_dirs[@]}"
}
# Write ${run_dir}/run-info.json describing the run.
#   $1  run directory
#   $2  run id
#   $3  boot id
# The JSON is produced by an inline Python script into a temporary file that
# is then renamed over run-info.json. It records the start time, hostname,
# git commit/dirty state, and every environment variable whose name starts
# with BLITZ_, OMNI_ or ROBOT_RECEIVER_.
blitz_dev_write_run_info() {
  local run_dir="$1"
  local run_id="$2"
  local boot_id="$3"
  local staged_info="${run_dir}/run-info.json.tmp.$$"
  local started_at commit_hash dirty_flag

  started_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  commit_hash="$(blitz_dev_git_commit)"
  dirty_flag="$(blitz_dev_git_dirty_flag)"

  python3 - \
    "${staged_info}" "${run_id}" "${run_dir}" "${boot_id}" \
    "${started_at}" "${commit_hash}" "${dirty_flag}" \
    "${HOSTNAME:-$(hostname)}" <<'PY'
import json
import os
import sys
path, run_id, run_dir, boot_id, started_at, commit_hash, dirty_flag, hostname = sys.argv[1:9]
payload = {
    "run_id": run_id,
    "run_dir": run_dir,
    "boot_id": boot_id,
    "started_at": started_at,
    "hostname": hostname,
    "git_commit": commit_hash,
    "git_dirty": dirty_flag == "1",
    "env": {
        key: os.environ.get(key, "")
        for key in sorted(os.environ)
        if key.startswith(("BLITZ_", "OMNI_", "ROBOT_RECEIVER_"))
    },
}
with open(path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
PY

  mv -f "${staged_info}" "${run_dir}/run-info.json"
}
# Start a new dev run.
#   $1  optional run id; defaults to the current compact UTC timestamp.
# Creates ${BLITZ_RUN_ROOT}/runs/<run id>, exports BLITZ_RUN_ID, BLITZ_RUN_DIR
# and BLITZ_BOOT_ID, writes the run-id file and the run-context file (the
# latter via a temporary file and rename), repoints the "current" symlink and
# finally writes run-info.json.
blitz_dev_init_run_context() {
  local run_id="${1:-$(blitz_dev_utc_compact_timestamp)}"
  local boot_id="dev-$(blitz_dev_utc_compact_timestamp)"
  local run_dir="${BLITZ_RUN_ROOT}/runs/${run_id}"
  local staged_context="${BLITZ_RUN_CONTEXT_FILE}.tmp.$$"

  blitz_dev_prepare_dirs
  mkdir -p "${run_dir}"

  export BLITZ_RUN_ID="${run_id}"
  export BLITZ_RUN_DIR="${run_dir}"
  export BLITZ_BOOT_ID="${boot_id}"

  printf '%s\n' "${run_id}" > "${BLITZ_RUN_ID_FILE}"

  # One KEY=value per line; blitz_dev_load_run_context sources this file.
  printf '%s\n' \
    "BLITZ_RUN_ID=${run_id}" \
    "BLITZ_RUN_DIR=${run_dir}" \
    "BLITZ_BOOT_ID=${boot_id}" \
    "BLITZ_RUN_ROOT=${BLITZ_RUN_ROOT}" \
    > "${staged_context}"
  mv -f "${staged_context}" "${BLITZ_RUN_CONTEXT_FILE}"

  ln -sfn "${run_dir}" "${BLITZ_CURRENT_RUN_LINK}"
  blitz_dev_write_run_info "${run_dir}" "${run_id}" "${boot_id}"
}
# Source the run-context file and export everything it assigns.
# Returns 1 when the file does not exist.
blitz_dev_load_run_context() {
  [[ -f "${BLITZ_RUN_CONTEXT_FILE}" ]] || return 1

  # set -a marks every variable assigned while sourcing for export.
  set -a
  # shellcheck disable=SC1090
  source "${BLITZ_RUN_CONTEXT_FILE}"
  set +a
}
# Reuse the existing run context when it can be loaded; otherwise start a
# new run.
blitz_dev_ensure_run_context() {
  blitz_dev_load_run_context || blitz_dev_init_run_context
}
# Discard the stored run context and run id, then start a fresh run.
blitz_dev_reset_run_context() {
  local -a stale_files=(
    "${BLITZ_RUN_CONTEXT_FILE}"
    "${BLITZ_RUN_ID_FILE}"
  )
  rm -f "${stale_files[@]}"
  blitz_dev_init_run_context
}
# Export BLITZ_INSTANCE_ID as "<compact UTC timestamp>-<shell pid>" unless a
# non-empty value is already set.
blitz_dev_init_instance_context() {
  if [[ -n "${BLITZ_INSTANCE_ID:-}" ]]; then
    return 0
  fi
  export BLITZ_INSTANCE_ID="$(blitz_dev_utc_compact_timestamp)-$$"
}
# Print the JSONL log path for a component:
#   ${BLITZ_RUN_DIR}/<stem>.${BLITZ_INSTANCE_ID}.jsonl
#   $1  log file stem, e.g. "a-control-events"
blitz_dev_component_log_path() {
  local stem="$1"
  printf '%s\n' "${BLITZ_RUN_DIR}/${stem}.${BLITZ_INSTANCE_ID}.jsonl"
}
# Point the A-side (backend) log paths at the current run directory.
# The network summary path is only redirected when it still has the legacy
# default value; the four BLITZ_A_* paths keep any non-empty value that is
# already set and otherwise get a per-instance path. All five are exported.
blitz_dev_prepare_backend_logging_env() {
  local legacy_summary_path="${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl"
  local spec
  local var_name
  local stem

  blitz_dev_init_instance_context

  if [[ "${OMNI_NETWORK_SUMMARY_LOG_PATH}" == "${legacy_summary_path}" ]]; then
    export OMNI_NETWORK_SUMMARY_LOG_PATH
    OMNI_NETWORK_SUMMARY_LOG_PATH="$(blitz_dev_component_log_path "a-network-summary")"
  fi

  # Each entry is "<variable name>:<log stem>".
  for spec in \
    "BLITZ_A_CONTROL_EVENTS_LOG_PATH:a-control-events" \
    "BLITZ_A_CONTROL_ACKS_LOG_PATH:a-control-acks" \
    "BLITZ_A_VIDEO_FRAME_RECV_LOG_PATH:a-video-frame-recv" \
    "BLITZ_A_VIDEO_DISPLAY_PROBE_LOG_PATH:a-video-display-probe"; do
    var_name="${spec%%:*}"
    stem="${spec#*:}"
    if [[ -z "${!var_name:-}" ]]; then
      printf -v "${var_name}" '%s' "$(blitz_dev_component_log_path "${stem}")"
    fi
    export "${var_name}"
  done
}
# Export the B-side log paths (KCP session stats, control latency, video
# frame stages). A non-empty value that is already set is kept; otherwise a
# per-instance path inside the run directory is used.
blitz_dev_prepare_bside_logging_env() {
  local spec
  local var_name
  local stem

  blitz_dev_init_instance_context

  # Each entry is "<variable name>:<log stem>".
  for spec in \
    "BLITZ_KCP_STATS_LOG_PATH:b-kcp-session-stats" \
    "BLITZ_CONTROL_LATENCY_LOG_PATH:b-control-latency" \
    "BLITZ_VIDEO_STAGE_LOG_PATH:b-video-frame-stages"; do
    var_name="${spec%%:*}"
    stem="${spec#*:}"
    if [[ -z "${!var_name:-}" ]]; then
      printf -v "${var_name}" '%s' "$(blitz_dev_component_log_path "${stem}")"
    fi
    export "${var_name}"
  done
}
# Export BLITZ_5G_LINK_LOG_PATH, keeping a non-empty value that is already
# set and otherwise using a per-instance path inside the run directory.
blitz_dev_prepare_5g_logging_env() {
  blitz_dev_init_instance_context
  if [[ -z "${BLITZ_5G_LINK_LOG_PATH:-}" ]]; then
    BLITZ_5G_LINK_LOG_PATH="$(blitz_dev_component_log_path "b-5g-link-quality")" || true
  fi
  export BLITZ_5G_LINK_LOG_PATH
}
# Run-context bootstrap executed whenever this file is sourced.
# When any of the three opt-out flags is "1", an existing context file is
# loaded on a best-effort basis and no new run is created; otherwise the
# context is loaded or, failing that, initialised.
if [[ "${BLITZ_SKIP_DEV_RUN_CONTEXT_INIT:-0}" == "1" || "${BLITZ_BOOT_LOADING_ENV:-0}" == "1" || "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
  if [[ -f "${BLITZ_RUN_CONTEXT_FILE}" ]]; then
    blitz_dev_load_run_context || true
  fi
else
  blitz_dev_ensure_run_context
fi

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Start a fresh dev run and print its id and directory.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Must be set before sourcing: it is read by load-env.sh while it is sourced.
export BLITZ_SKIP_DEV_RUN_CONTEXT_INIT="1"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/load-env.sh"
blitz_dev_reset_run_context
# Two lines, key=value, for easy consumption by callers.
printf 'run_id=%s\nrun_dir=%s\n' "${BLITZ_RUN_ID}" "${BLITZ_RUN_DIR}"

View File

@@ -5,11 +5,13 @@
# Example: # Example:
# ROBOT_COMMAND_CENTER_ROOT="$HOME/Documents/robot-command-center" # ROBOT_COMMAND_CENTER_ROOT="$HOME/Documents/robot-command-center"
CONTROL_SIDE_OMNISOCKET_SERVER_ADDR="81.70.156.140:10909" CONTROL_SIDE_OMNISOCKET_SERVER_ADDR="81.70.156.140:10909" # D
CONTROL_SIDE_OMNISOCKET_RELAY_VIA="81.70.156.140:10909" CONTROL_SIDE_OMNISOCKET_RELAY_VIA="106.55.173.235:10909" # C
ROBOT_SIDE_OMNISOCKET_SERVER_ADDR="81.70.156.140:10909" ROBOT_SIDE_OMNISOCKET_SERVER_ADDR="81.70.156.140:10909" # D
ROBOT_SIDE_OMNISOCKET_RELAY_VIA="106.55.173.235:10909" ROBOT_SIDE_OMNISOCKET_RELAY_VIA="81.70.156.140:10909" # 直连 D
# Log one normal relay packet out of every N packets. Drop events still log immediately.
OMNI_RELAY_PACKET_LOG_SAMPLE_EVERY="200"
CONTROL_WS_ALLOWED_ORIGINS="http://127.0.0.1:5173,http://localhost:5173" CONTROL_WS_ALLOWED_ORIGINS="http://127.0.0.1:5173,http://localhost:5173"
VITE_API_BASE_URL="http://127.0.0.1:8001" VITE_API_BASE_URL="http://127.0.0.1:8001"
@@ -24,7 +26,7 @@ OMNISOCKET_TELEMETRY_INTERVAL_MS="1000"
OMNISOCKET_TELEMETRY_STALE_AFTER_MS="3000" OMNISOCKET_TELEMETRY_STALE_AFTER_MS="3000"
OMNI_NETWORK_SUMMARY_LOG_ENABLED="1" OMNI_NETWORK_SUMMARY_LOG_ENABLED="1"
OMNI_NETWORK_SUMMARY_LOG_PATH="${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl" OMNI_NETWORK_SUMMARY_LOG_PATH="${OMNISOCKETGO_ROOT}/logs/a-network-summary.jsonl"
OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS="5000" OMNI_NETWORK_SUMMARY_LOG_INTERVAL_MS="1000"
OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC="3" OMNI_NETWORK_SUMMARY_LOG_REQUEST_TIMEOUT_SEC="3"
FRONTEND_HOST="0.0.0.0" FRONTEND_HOST="0.0.0.0"
@@ -44,22 +46,29 @@ ROBOT_RECEIVER_PUBLISH_RATE_HZ="100.0"
OMNI_VIDEO_PEER_ID="peer-b-video" OMNI_VIDEO_PEER_ID="peer-b-video"
OMNI_VIDEO_TARGET_PEER="peer-a-video" OMNI_VIDEO_TARGET_PEER="peer-a-video"
OMNI_CAMERA_DEVICE="/dev/video26" OMNI_GPSD_HOST="127.0.0.1"
OMNI_CAMERA_PROFILE="night" OMNI_CAMERA_DEVICE="/dev/v4l/by-path/platform-a80aa10000.usb-usb-0:3.2:1.4-video-index0"
OMNI_CAMERA_PROFILE="day"
OMNI_CAMERA_BRIGHTNESS="" OMNI_CAMERA_BRIGHTNESS=""
OMNI_CAMERA_CUSTOM_CTRL="" OMNI_CAMERA_CUSTOM_CTRL=""
OMNI_CAMERA_VERIFY="0" OMNI_CAMERA_VERIFY="0"
OMNI_VIDEO_SERVER_ADDR="${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR}" OMNI_VIDEO_SERVER_ADDR="${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR}"
OMNI_VIDEO_RELAY_VIA="${ROBOT_SIDE_OMNISOCKET_RELAY_VIA}" OMNI_VIDEO_RELAY_VIA="${ROBOT_SIDE_OMNISOCKET_RELAY_VIA}"
OMNI_VIDEO_SOFT_BACKPRESSURE_SEGMENTS="64" OMNI_VIDEO_SOFT_BACKPRESSURE_SEGMENTS="256"
OMNI_VIDEO_HARD_BACKPRESSURE_SEGMENTS="192" OMNI_VIDEO_HARD_BACKPRESSURE_SEGMENTS="1024"
OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS="1000" OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS="5000"
OMNI_VIDEO_FRAME_STALL_RECONNECT_MS="30000"
OMNI_CONTROL_PEER_ID="peer-b-ctrl" OMNI_CONTROL_PEER_ID="peer-b-ctrl"
OMNI_CONTROL_EXPECTED_SENDER="peer-a-ctrl" OMNI_CONTROL_EXPECTED_SENDER="peer-a-ctrl"
OMNI_CONTROL_SERVER_ADDR="${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR}" OMNI_CONTROL_SERVER_ADDR="${ROBOT_SIDE_OMNISOCKET_SERVER_ADDR}"
OMNI_CONTROL_RELAY_VIA="${ROBOT_SIDE_OMNISOCKET_RELAY_VIA}" OMNI_CONTROL_RELAY_VIA="${ROBOT_SIDE_OMNISOCKET_RELAY_VIA}"
OMNI_CONTROL_UNIX_SOCKET_PATH="${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" OMNI_CONTROL_UNIX_SOCKET_PATH="${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}"
OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS="3000" OMNI_CONTROL_ACK_PEER_ID="peer-b-ctrl-ack"
OMNI_CONTROL_ACK_TARGET_PEER="peer-a-ctrl-ack"
BLITZ_CONTROL_ACK_SAMPLE_MOD="10"
BLITZ_VIDEO_STAGE_LOG_ENABLED="1"
BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD="10"
OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS="30000"
# A-side backend video freshness guard. Used by scripts/dev/start-backend.sh. # A-side backend video freshness guard. Used by scripts/dev/start-backend.sh.
OMNI_VIDEO_MAX_FRAME_AGE_MS="1000" OMNI_VIDEO_MAX_FRAME_AGE_MS="1000"

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Dev wrapper: load the shared environment, prepare the 5G link log path,
# then replace this process with the boot-time 5G link logger script.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/load-env.sh"
blitz_dev_prepare_5g_logging_env
# exec so the logger takes over this script's PID.
exec bash "${OMNISOCKETGO_ROOT}/scripts/boot/blitz-5g-link-logger.sh"

View File

@@ -4,6 +4,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source "${SCRIPT_DIR}/load-env.sh" source "${SCRIPT_DIR}/load-env.sh"
blitz_dev_prepare_bside_logging_env
cd "${OMNISOCKETGO_ROOT}" cd "${OMNISOCKETGO_ROOT}"
@@ -14,6 +15,24 @@ export OMNI_VIDEO_RELAY_VIA="${OMNI_VIDEO_RELAY_VIA}"
export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR}" export OMNI_CONTROL_SERVER_ADDR="${OMNI_CONTROL_SERVER_ADDR}"
export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA}" export OMNI_CONTROL_RELAY_VIA="${OMNI_CONTROL_RELAY_VIA}"
logger_pid=""
# Stop the background 5G link logger, if one was started, and reap it.
# Failures from kill/wait are ignored.
cleanup() {
  [[ -n "${logger_pid}" ]] || return 0
  kill "${logger_pid}" 2>/dev/null || true
  wait "${logger_pid}" 2>/dev/null || true
}
# Launch the 5G link logger in the background and remember its PID in
# logger_pid. Does nothing when OMNI_BOOT_MODE is "1".
start_5g_link_logger_if_needed() {
  if [[ "${OMNI_BOOT_MODE:-0}" != "1" ]]; then
    bash "${SCRIPT_DIR}/start-5g-link-logger.sh" &
    logger_pid=$!
    echo "[start-b-side-omnid] 5G link logger -> ${BLITZ_5G_LINK_LOG_PATH:-unset}" >&2
  fi
}
if [[ ! -x "./bin/b_side_omnid" ]]; then if [[ ! -x "./bin/b_side_omnid" ]]; then
if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then if [[ "${OMNI_BOOT_MODE:-0}" == "1" ]]; then
echo "Missing ./bin/b_side_omnid in boot mode; build it before enabling the autostart service." >&2 echo "Missing ./bin/b_side_omnid in boot mode; build it before enabling the autostart service." >&2
@@ -23,12 +42,14 @@ if [[ ! -x "./bin/b_side_omnid" ]]; then
fi fi
launch_b_side_omnid() { launch_b_side_omnid() {
trap cleanup EXIT INT TERM
start_5g_link_logger_if_needed
bash "${SCRIPT_DIR}/apply-camera-controls.sh" bash "${SCRIPT_DIR}/apply-camera-controls.sh"
exec ./bin/b_side_omnid ./bin/b_side_omnid
} }
if [[ "${B_SIDE_OMNID_USE_SUDO}" == "1" && "${EUID}" -ne 0 ]]; then if [[ "${B_SIDE_OMNID_USE_SUDO}" == "1" && "${EUID}" -ne 0 ]]; then
exec sudo -E bash -lc 'cd "$1" && bash "$2" && exec "$3"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/apply-camera-controls.sh" "./bin/b_side_omnid" exec sudo -E bash -lc 'cd "$1" && export B_SIDE_OMNID_USE_SUDO=0 && exec bash "$2"' _ "${OMNISOCKETGO_ROOT}" "${SCRIPT_DIR}/start-b-side-omnid.sh"
fi fi
launch_b_side_omnid launch_b_side_omnid

View File

@@ -5,6 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source "${SCRIPT_DIR}/load-env.sh" source "${SCRIPT_DIR}/load-env.sh"
require_robot_command_center_root require_robot_command_center_root
blitz_dev_prepare_backend_logging_env
if [[ ! -d "${PYTHON_VENV_PATH}" ]]; then if [[ ! -d "${PYTHON_VENV_PATH}" ]]; then
"${PYTHON3_BIN}" -m venv "${PYTHON_VENV_PATH}" "${PYTHON3_BIN}" -m venv "${PYTHON_VENV_PATH}"

View File

@@ -42,8 +42,19 @@ static gps_video_sample_t load_gps(void) {
return sample; return sample;
} }
// 将经纬度规范化为 float保留 6 位小数。 static void gps_sleep_before_retry(void) {
static int normalize_gps(float latitude, float longitude, gps_video_sample_t* sample) { int retry_ms = 1000;
int step_ms = 100;
int elapsed_ms = 0;
while (g_running && elapsed_ms < retry_ms) {
usleep((useconds_t) step_ms * 1000U);
elapsed_ms += step_ms;
}
}
// 将经纬度规范化为 double保留 6 位小数。
static int normalize_gps(double latitude, double longitude, gps_video_sample_t* sample) {
if (!isfinite(latitude) || !isfinite(longitude)) { if (!isfinite(latitude) || !isfinite(longitude)) {
return -1; return -1;
} }
@@ -150,76 +161,89 @@ static int is_tpv_class(const char* json) {
// ================================================================= // =================================================================
void* gps_update_thread(void* arg) { void* gps_update_thread(void* arg) {
const char* host = (const char*)arg; const char* host = (const char*)arg;
int sockfd; const char* gpsd_host = (host != NULL && host[0] != '\0') ? host : "127.0.0.1";
struct addrinfo hints, *res, *rp;
while (g_running) {
int sockfd = -1;
struct addrinfo hints;
struct addrinfo *res = NULL;
struct addrinfo *rp = NULL;
int s; int s;
char buffer[4096];
size_t offset = 0;
// 1. 解析地址并连接 gpsd (默认端口 2947) // 1. 解析地址并连接 gpsd (默认端口 2947)
memset(&hints, 0, sizeof(hints)); memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_UNSPEC; // 兼容 IPv4/IPv6 hints.ai_family = AF_UNSPEC; // 兼容 IPv4/IPv6
hints.ai_socktype = SOCK_STREAM; hints.ai_socktype = SOCK_STREAM;
s = getaddrinfo(host, "2947", &hints, &res); s = getaddrinfo(gpsd_host, "2947", &hints, &res);
if (s != 0) { if (s != 0) {
fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(s)); fprintf(stderr, "GPS线程: 解析 gpsd 地址失败 %s:2947: %s\n", gpsd_host, gai_strerror(s));
return NULL; gps_sleep_before_retry();
continue;
} }
// 尝试连接每一个解析出来的地址 // 尝试连接每一个解析出来的地址
for (rp = res; rp != NULL; rp = rp->ai_next) { for (rp = res; rp != NULL; rp = rp->ai_next) {
sockfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); sockfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
if (sockfd == -1) continue; if (sockfd == -1) {
continue;
}
if (connect(sockfd, rp->ai_addr, rp->ai_addrlen) != -1) { if (connect(sockfd, rp->ai_addr, rp->ai_addrlen) != -1) {
break; // 成功连接 break;
} }
close(sockfd); close(sockfd);
sockfd = -1;
} }
if (rp == NULL) { // 没有连接成功
fprintf(stderr, "无法连接到 %s:2947\n", host);
freeaddrinfo(res);
return NULL;
}
freeaddrinfo(res); freeaddrinfo(res);
printf("GPS线程: 已连接到 gpsd %s\n", host); if (sockfd < 0) {
fprintf(stderr, "GPS线程: 无法连接到 %s:29471 秒后重试\n", gpsd_host);
gps_sleep_before_retry();
continue;
}
printf("GPS线程: 已连接到 gpsd %s\n", gpsd_host);
// 2. 发送 WATCH 命令,开启 JSON 流 // 2. 发送 WATCH 命令,开启 JSON 流
{
const char* watch_cmd = "?WATCH={\"enable\":true,\"json\":true};\n"; const char* watch_cmd = "?WATCH={\"enable\":true,\"json\":true};\n";
if (send(sockfd, watch_cmd, strlen(watch_cmd), 0) < 0) { if (send(sockfd, watch_cmd, strlen(watch_cmd), 0) < 0) {
perror("发送 WATCH 命令失败"); perror("GPS线程: 发送 WATCH 命令失败");
close(sockfd); close(sockfd);
return NULL; gps_sleep_before_retry();
continue;
}
} }
// 3. 主循环:读取并解析数据流 // 3. 主循环:读取并解析数据流
// 注意gpsd 数据是以 \n 结尾的,不能直接用固定长度 recv // 注意gpsd 数据是以 \n 结尾的,不能直接用固定长度 recv
char buffer[4096]; // 增大缓冲区以容纳长 JSON
size_t offset = 0; // 当前缓冲区数据长度
while (g_running) { while (g_running) {
ssize_t len = recv(sockfd, buffer + offset, sizeof(buffer) - 1 - offset, 0); ssize_t len = recv(sockfd, buffer + offset, sizeof(buffer) - 1 - offset, 0);
if (len <= 0) { if (len <= 0) {
// 连接断开,进行重连逻辑
break; break;
} }
offset += len; offset += (size_t) len;
buffer[offset] = '\0'; // 确保字符串结束 buffer[offset] = '\0'; // 确保字符串结束
// 查找换行符 \n因为一条完整的 JSON 消息以 \n 结尾 // 查找换行符 \n因为一条完整的 JSON 消息以 \n 结尾
char* start = buffer; char* start = buffer;
char* end; char* end;
while ((end = memchr(start, '\n', buffer + offset - start)) != NULL) { while ((end = memchr(start, '\n', (buffer + offset) - start)) != NULL) {
*end = '\0'; // 临时截断,形成独立字符串 *end = '\0'; // 临时截断,形成独立字符串
// --- 核心解析逻辑 --- // --- 核心解析逻辑 ---
// 1. 检查是否为 TPV 数据包 // 1. 检查是否为 TPV 数据包
if (is_tpv_class(start)) { if (is_tpv_class(start)) {
double lat = 0.0, lon = 0.0; double lat = 0.0;
double lon = 0.0;
int mode = 0; int mode = 0;
int has_fix = 0; int has_fix = 0;
@@ -253,9 +277,8 @@ void* gps_update_thread(void* arg) {
} }
// 处理完所有完整消息后,将剩余未处理的数据移到缓冲区头部 // 处理完所有完整消息后,将剩余未处理的数据移到缓冲区头部
// (这种情况很少见,但为了严谨性)
if (start < buffer + offset) { if (start < buffer + offset) {
size_t remaining = (buffer + offset) - start; size_t remaining = (size_t) ((buffer + offset) - start);
memmove(buffer, start, remaining); memmove(buffer, start, remaining);
offset = remaining; offset = remaining;
} else { } else {
@@ -263,12 +286,13 @@ void* gps_update_thread(void* arg) {
} }
} }
// 循环结束,清理资源
close(sockfd); close(sockfd);
printf("GPS线程: 连接断开,尝试重连...\n"); if (g_running) {
fprintf(stderr, "GPS线程: 连接断开1 秒后重连...\n");
gps_sleep_before_retry();
}
}
// 这里可以添加一个短暂的休眠防止重连风暴
// 但通常主程序会重启线程
return NULL; return NULL;
} }

View File

@@ -18,7 +18,7 @@ kcp_packet_debug_logger_t *kcp_packet_debug_open_jsonl(const char *path) {
fclose(file); fclose(file);
return NULL; return NULL;
} }
omni_file_logger_init(&logger->file_logger, file); omni_file_logger_init_path(&logger->file_logger, file, path, 0);
logger->enabled = 1; logger->enabled = 1;
return logger; return logger;
} }

View File

@@ -73,7 +73,7 @@ kcp_session_stats_logger_t *kcp_session_stats_open_jsonl(const char *path) {
fclose(file); fclose(file);
return NULL; return NULL;
} }
omni_file_logger_init(&logger->file_logger, file); omni_file_logger_init_path(&logger->file_logger, file, path, 0);
logger->enabled = 1; logger->enabled = 1;
return logger; return logger;
} }
@@ -156,10 +156,18 @@ int kcp_session_stats_log(kcp_session_stats_logger_t *logger, const kcp_session_
kcp_session_stats_appendf(&line, &line_len, ",\"srtt_ms\":%d", record->srtt_ms) != 0) { kcp_session_stats_appendf(&line, &line_len, ",\"srtt_ms\":%d", record->srtt_ms) != 0) {
goto cleanup; goto cleanup;
} }
if (record->has_min_srtt_ms &&
kcp_session_stats_appendf(&line, &line_len, ",\"min_srtt_ms\":%d", record->min_srtt_ms) != 0) {
goto cleanup;
}
if (record->has_srttvar_ms && if (record->has_srttvar_ms &&
kcp_session_stats_appendf(&line, &line_len, ",\"srttvar_ms\":%d", record->srttvar_ms) != 0) { kcp_session_stats_appendf(&line, &line_len, ",\"srttvar_ms\":%d", record->srttvar_ms) != 0) {
goto cleanup; goto cleanup;
} }
if (record->has_last_feedback_age_ms &&
kcp_session_stats_appendf(&line, &line_len, ",\"last_feedback_age_ms\":%u", record->last_feedback_age_ms) != 0) {
goto cleanup;
}
if (record->has_snd_wnd && if (record->has_snd_wnd &&
kcp_session_stats_appendf(&line, &line_len, ",\"snd_wnd\":%u", record->snd_wnd) != 0) { kcp_session_stats_appendf(&line, &line_len, ",\"snd_wnd\":%u", record->snd_wnd) != 0) {
goto cleanup; goto cleanup;

View File

@@ -32,7 +32,7 @@ latency_logger_t *latencylog_open_jsonl(const char *path) {
fclose(file); fclose(file);
return NULL; return NULL;
} }
omni_file_logger_init(&logger->file_logger, file); omni_file_logger_init_path(&logger->file_logger, file, path, 0);
logger->enabled = 1; logger->enabled = 1;
return logger; return logger;
} }

View File

@@ -544,9 +544,217 @@ const char *omni_path_base_name(const char *path) {
return slash == NULL ? path : slash + 1; return slash == NULL ? path : slash + 1;
} }
/* Current CLOCK_MONOTONIC reading expressed in whole milliseconds. */
static uint64_t omni_now_monotonic_ms64(void) {
    struct timespec now;
    uint64_t whole_ms;
    uint64_t frac_ms;

    clock_gettime(CLOCK_MONOTONIC, &now);
    whole_ms = (uint64_t) now.tv_sec * 1000ULL;
    frac_ms = (uint64_t) (now.tv_nsec / 1000000L);
    return whole_ms + frac_ms;
}
/*
 * Read environment variable `name` as a strictly positive decimal int.
 *
 * Returns `default_value` when the variable is unset or empty, is not a
 * complete decimal number, is zero or negative, or does not fit in an int.
 * The caller's errno is left unchanged.
 */
static int omni_positive_int_env(const char *name, int default_value) {
    const char *raw = getenv(name);
    char *endptr = NULL;
    long parsed;
    int saved_errno;
    int out_of_range;

    if (raw == NULL || raw[0] == '\0') {
        return default_value;
    }

    saved_errno = errno;
    errno = 0;
    parsed = strtol(raw, &endptr, 10);
    out_of_range = (errno == ERANGE);
    errno = saved_errno;

    if (endptr == raw || *endptr != '\0') {
        return default_value;
    }
    /* Reject values that would be truncated (or turn negative) when narrowed to int. */
    if (out_of_range || parsed <= 0 || parsed > (long) INT_MAX) {
        return default_value;
    }
    return (int) parsed;
}
/*
 * Read environment variable `name` as a strictly positive decimal size.
 *
 * Returns `default_value` when the variable is unset or empty, is not a
 * complete decimal number, is zero, carries a minus sign, or does not fit in
 * a size_t. The caller's errno is left unchanged.
 */
static size_t omni_positive_size_env(const char *name, size_t default_value) {
    const char *raw = getenv(name);
    char *endptr = NULL;
    unsigned long long parsed;
    int saved_errno;
    int out_of_range;

    if (raw == NULL || raw[0] == '\0') {
        return default_value;
    }
    /*
     * strtoull accepts "-N" and returns the negated value wrapped to an
     * unsigned number, so "-1" would otherwise be read as a huge size.
     */
    if (strchr(raw, '-') != NULL) {
        return default_value;
    }

    saved_errno = errno;
    errno = 0;
    parsed = strtoull(raw, &endptr, 10);
    out_of_range = (errno == ERANGE);
    errno = saved_errno;

    if (endptr == raw || *endptr != '\0' || parsed == 0ULL) {
        return default_value;
    }
    /* The round-trip check catches truncation where size_t is narrower than unsigned long long. */
    if (out_of_range || (unsigned long long) (size_t) parsed != parsed) {
        return default_value;
    }
    return (size_t) parsed;
}
/*
 * Flush the logger's stdio stream and reset its buffered-byte accounting.
 *
 * On success the buffered byte count is cleared, `now_ms` is stored as the
 * last flush time and 0 is returned. Returns -1 with errno = EINVAL when the
 * logger or its stream is NULL, and -1 when fflush fails.
 * NOTE(review): the _locked suffix suggests the caller already holds the
 * logger's lock - confirm against the callers.
 */
static int omni_file_logger_flush_locked(omni_file_logger_t *logger, uint64_t now_ms) {
    int rc = -1;

    if (logger != NULL && logger->file != NULL) {
        if (fflush(logger->file) == 0) {
            logger->buffered_bytes = 0U;
            logger->last_flush_monotonic_ms = now_ms;
            rc = 0;
        }
    } else {
        errno = EINVAL;
    }
    return rc;
}
/*
 * Format "<path>.<suffix>" into `buffer`.
 *
 * Returns 0 on success. Returns -1 with errno = EINVAL for a NULL or
 * zero-length buffer or a NULL/empty path, and -1 with errno = ENAMETOOLONG
 * when the buffer does not leave at least 16 spare bytes after the path or
 * the formatted result does not fit.
 */
static int omni_build_rotated_path(char *buffer, size_t buffer_len, const char *path, int suffix) {
    int total;

    if (buffer == NULL || buffer_len == 0U || path == NULL || path[0] == '\0') {
        errno = EINVAL;
        return -1;
    }
    /* Keep a fixed 16-byte reserve for ".<suffix>" and the terminator. */
    if (strlen(path) + 16U >= buffer_len) {
        errno = ENAMETOOLONG;
        return -1;
    }

    total = snprintf(buffer, buffer_len, "%s.%d", path, suffix);
    if (total < 0 || (size_t) total >= buffer_len) {
        errno = ENAMETOOLONG;
        return -1;
    }
    return 0;
}
/*
 * Open logger->path in binary append mode and make it the logger's stream.
 *
 * On success the current size is taken from stat() on the path (0 when stat
 * fails), the buffered byte count is cleared, the last flush time is set to
 * now, and 0 is returned. Returns -1 with errno = EINVAL when the logger is
 * NULL or has no path, and -1 when fopen fails; the logger is not modified
 * in either failure case.
 */
static int omni_file_logger_reopen_append_locked(omni_file_logger_t *logger) {
    struct stat info;
    FILE *stream;

    if (logger == NULL || logger->path[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    stream = fopen(logger->path, "ab");
    if (stream == NULL) {
        return -1;
    }

    logger->file = stream;
    logger->current_bytes = (stat(logger->path, &info) == 0) ? (size_t) info.st_size : 0U;
    logger->buffered_bytes = 0U;
    logger->last_flush_monotonic_ms = omni_now_monotonic_ms64();
    return 0;
}
/*
 * Try to get the logger writing again after a failed rotation.
 *
 * First attempts to reopen logger->path. If that fails and
 * rotated_current_path is non-empty, renames it back to logger->path and
 * tries the reopen once more. Returns 0 as soon as a reopen succeeds;
 * otherwise returns -1 with errno set to the error of the first reopen attempt.
 */
static int omni_file_logger_recover_after_rotate_locked(omni_file_logger_t *logger, const char *rotated_current_path) {
    int first_errno;
    int have_rotated_copy;

    if (omni_file_logger_reopen_append_locked(logger) == 0) {
        return 0;
    }
    first_errno = errno;

    have_rotated_copy = rotated_current_path != NULL && rotated_current_path[0] != '\0';
    if (have_rotated_copy
        && rename(rotated_current_path, logger->path) == 0
        && omni_file_logger_reopen_append_locked(logger) == 0) {
        return 0;
    }

    errno = first_errno;
    return -1;
}
/*
 * Rotate the log file behind `logger`.
 *
 * Sequence: flush and close the current stream, unlink "<path>.<max_files>",
 * rename "<path>.<i>" to "<path>.<i+1>" for i = max_files-1 down to 1,
 * rename "<path>" to "<path>.1", then reopen "<path>" in append mode.
 *
 * Returns 0 on success, and also 0 without doing anything when the logger is
 * NULL, has no path, or rotation is disabled (max_bytes == 0 or
 * max_files <= 0). Returns -1 on failure with errno describing the step that
 * failed. After the stream has been closed, a failure also triggers an
 * attempt to reopen the log, but the function still returns -1.
 * The "_locked" suffix presumably means the caller holds logger->mutex.
 */
static int omni_file_logger_rotate_locked(omni_file_logger_t *logger) {
int index;
int saved_errno = 0;
int should_recover = 0;
char rotated_current_path[PATH_MAX];
char from_path[PATH_MAX];
char to_path[PATH_MAX];
if (logger == NULL || logger->path[0] == '\0' || logger->max_bytes == 0U || logger->max_files <= 0) {
return 0;
}
/* Empty until the live file has been moved to "<path>.1". */
rotated_current_path[0] = '\0';
if (logger->file != NULL) {
/* A flush failure leaves the stream open, so return without the recover path. */
if (omni_file_logger_flush_locked(logger, omni_now_monotonic_ms64()) != 0) {
return -1;
}
/* From here on the stream is (about to be) closed; every failure must try to reopen it. */
should_recover = 1;
if (fclose(logger->file) != 0) {
logger->file = NULL;
saved_errno = errno;
goto recover;
}
logger->file = NULL;
}
if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, logger->max_files) != 0) {
saved_errno = errno;
goto recover;
}
/* Drop the oldest slot; the result is ignored. */
unlink(from_path);
/* Shift the remaining slots up by one, highest index first. */
for (index = logger->max_files - 1; index >= 1; --index) {
if (omni_build_rotated_path(from_path, sizeof(from_path), logger->path, index) != 0 ||
omni_build_rotated_path(to_path, sizeof(to_path), logger->path, index + 1) != 0) {
saved_errno = errno;
goto recover;
}
/* ENOENT is tolerated: the source slot does not exist. */
if (rename(from_path, to_path) != 0 && errno != ENOENT) {
saved_errno = errno;
goto recover;
}
}
if (omni_build_rotated_path(to_path, sizeof(to_path), logger->path, 1) != 0) {
saved_errno = errno;
goto recover;
}
/* Move the live file into slot 1; a missing live file (ENOENT) is not an error. */
if (rename(logger->path, to_path) != 0 && errno != ENOENT) {
saved_errno = errno;
goto recover;
}
/* Remember slot 1 so recovery can rename it back if the reopen below fails. */
/* NOTE(review): this is recorded even when the rename above failed with ENOENT. */
snprintf(rotated_current_path, sizeof(rotated_current_path), "%s", to_path);
if (omni_file_logger_reopen_append_locked(logger) != 0) {
saved_errno = errno;
goto recover;
}
return 0;
recover:
if (should_recover) {
/* Capture the error to report before the recovery attempt can overwrite errno. */
int recover_errno = saved_errno != 0 ? saved_errno : errno;
if (omni_file_logger_recover_after_rotate_locked(logger, rotated_current_path) == 0) {
errno = recover_errno;
} else if (saved_errno != 0) {
errno = saved_errno;
}
} else if (saved_errno != 0) {
errno = saved_errno;
}
/* The rotation itself failed, so report -1 even if the stream was reopened. */
return -1;
}
void omni_file_logger_init(omni_file_logger_t *logger, FILE *file) { void omni_file_logger_init(omni_file_logger_t *logger, FILE *file) {
memset(logger, 0, sizeof(*logger));
logger->file = file; logger->file = file;
pthread_mutex_init(&logger->mutex, NULL); pthread_mutex_init(&logger->mutex, NULL);
logger->flush_bytes = 1U;
logger->flush_interval_ms = 0;
logger->immediate_flush = 1;
logger->last_flush_monotonic_ms = omni_now_monotonic_ms64();
}
void omni_file_logger_init_path(omni_file_logger_t *logger, FILE *file, const char *path, int immediate_flush) {
struct stat st;
omni_file_logger_init(logger, file);
if (path != NULL && path[0] != '\0') {
snprintf(logger->path, sizeof(logger->path), "%s", path);
if (stat(path, &st) == 0) {
logger->current_bytes = (size_t) st.st_size;
}
}
logger->flush_bytes = omni_positive_size_env("BLITZ_JSONL_FLUSH_BYTES", 262144U);
logger->flush_interval_ms = omni_positive_int_env("BLITZ_JSONL_FLUSH_INTERVAL_MS", 1000);
logger->max_bytes = omni_positive_size_env("BLITZ_JSONL_ROTATE_BYTES", 134217728U);
logger->max_files = omni_positive_int_env("BLITZ_JSONL_ROTATE_FILES", 8);
logger->immediate_flush = immediate_flush != 0;
} }
void omni_file_logger_destroy(omni_file_logger_t *logger) { void omni_file_logger_destroy(omni_file_logger_t *logger) {
@@ -555,13 +763,32 @@ void omni_file_logger_destroy(omni_file_logger_t *logger) {
int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line) { int omni_file_logger_write_line(omni_file_logger_t *logger, const char *line) {
int rc = 0; int rc = 0;
size_t line_len;
uint64_t now_ms;
if (logger == NULL || logger->file == NULL || line == NULL) { if (logger == NULL || logger->file == NULL || line == NULL) {
errno = EINVAL; errno = EINVAL;
return -1; return -1;
} }
line_len = strlen(line) + 1U;
now_ms = omni_now_monotonic_ms64();
pthread_mutex_lock(&logger->mutex); pthread_mutex_lock(&logger->mutex);
if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF || fflush(logger->file) != 0) { if (fputs(line, logger->file) == EOF || fputc('\n', logger->file) == EOF) {
rc = -1; rc = -1;
} else {
logger->current_bytes += line_len;
logger->buffered_bytes += line_len;
if (logger->immediate_flush ||
logger->buffered_bytes >= logger->flush_bytes ||
(logger->flush_interval_ms > 0 && now_ms - logger->last_flush_monotonic_ms >= (uint64_t) logger->flush_interval_ms)) {
if (omni_file_logger_flush_locked(logger, now_ms) != 0) {
rc = -1;
}
}
if (rc == 0 && logger->max_bytes > 0U && logger->current_bytes >= logger->max_bytes) {
if (omni_file_logger_rotate_locked(logger) != 0) {
rc = -1;
}
}
} }
pthread_mutex_unlock(&logger->mutex); pthread_mutex_unlock(&logger->mutex);
return rc; return rc;

View File

@@ -477,6 +477,16 @@ int kcp_client_send_text(kcp_client_t *client, const char *to, const char *text)
} }
int kcp_client_send_binary(kcp_client_t *client, const char *to, const void *data, size_t data_len) { int kcp_client_send_binary(kcp_client_t *client, const char *to, const void *data, size_t data_len) {
return kcp_client_send_binary_with_id(client, to, data, data_len, NULL);
}
int kcp_client_send_binary_with_id(
kcp_client_t *client,
const char *to,
const void *data,
size_t data_len,
uint64_t *out_id
) {
message_t msg; message_t msg;
uint64_t id; uint64_t id;
@@ -508,6 +518,9 @@ int kcp_client_send_binary(kcp_client_t *client, const char *to, const void *dat
protocol_message_clear(&msg); protocol_message_clear(&msg);
return -1; return -1;
} }
if (out_id != NULL) {
*out_id = id;
}
protocol_message_clear(&msg); protocol_message_clear(&msg);
return 0; return 0;
} }

View File

@@ -228,7 +228,9 @@ static int kcp_hub_add_runtime_stats_json(cJSON *object, const kcp_runtime_stats
cJSON_AddNumberToObject(object, "conv", (double) stats->conv) == NULL || cJSON_AddNumberToObject(object, "conv", (double) stats->conv) == NULL ||
cJSON_AddNumberToObject(object, "rto_ms", (double) stats->rto_ms) == NULL || cJSON_AddNumberToObject(object, "rto_ms", (double) stats->rto_ms) == NULL ||
cJSON_AddNumberToObject(object, "srtt_ms", (double) stats->srtt_ms) == NULL || cJSON_AddNumberToObject(object, "srtt_ms", (double) stats->srtt_ms) == NULL ||
cJSON_AddNumberToObject(object, "min_srtt_ms", (double) stats->min_srtt_ms) == NULL ||
cJSON_AddNumberToObject(object, "srttvar_ms", (double) stats->srttvar_ms) == NULL || cJSON_AddNumberToObject(object, "srttvar_ms", (double) stats->srttvar_ms) == NULL ||
cJSON_AddNumberToObject(object, "last_feedback_age_ms", (double) stats->last_feedback_age_ms) == NULL ||
cJSON_AddNumberToObject(object, "snd_wnd", (double) stats->snd_wnd) == NULL || cJSON_AddNumberToObject(object, "snd_wnd", (double) stats->snd_wnd) == NULL ||
cJSON_AddNumberToObject(object, "rmt_wnd", (double) stats->rmt_wnd) == NULL || cJSON_AddNumberToObject(object, "rmt_wnd", (double) stats->rmt_wnd) == NULL ||
cJSON_AddNumberToObject(object, "inflight", (double) stats->inflight) == NULL || cJSON_AddNumberToObject(object, "inflight", (double) stats->inflight) == NULL ||

View File

@@ -1,10 +1,14 @@
#include "server_udp_relay.h" #include "server_udp_relay.h"
#include <arpa/inet.h> #include <arpa/inet.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h> #include <unistd.h>
#define UDP_RELAY_BUF_SIZE (64U * 1024U) #define UDP_RELAY_BUF_SIZE (64U * 1024U)
#define UDP_RELAY_ROUTE_TIMEOUT_MS 30000U #define UDP_RELAY_ROUTE_TIMEOUT_MS 30000U
#define UDP_RELAY_DEFAULT_PACKET_LOG_SAMPLE_EVERY 200U
struct udp_relay { struct udp_relay {
int downstream_fd; int downstream_fd;
@@ -20,6 +24,8 @@ struct udp_relay {
struct udp_relay_route *routes; struct udp_relay_route *routes;
pthread_mutex_t lock; pthread_mutex_t lock;
pthread_mutex_t log_mu; pthread_mutex_t log_mu;
unsigned int packet_log_sample_every;
atomic_ullong packet_log_counter;
pthread_mutex_t state_mu; pthread_mutex_t state_mu;
pthread_cond_t state_cond; pthread_cond_t state_cond;
pthread_t downstream_thread; pthread_t downstream_thread;
@@ -48,6 +54,44 @@ static uint32_t udp_relay_elapsed_ms(uint32_t now_ms, uint32_t then_ms) {
return now_ms - then_ms; return now_ms - then_ms;
} }
/*
 * Read OMNI_RELAY_PACKET_LOG_SAMPLE_EVERY from the environment.
 *
 * Returns UDP_RELAY_DEFAULT_PACKET_LOG_SAMPLE_EVERY when the variable is
 * unset or empty, contains a minus sign, or is not a complete decimal
 * number. Values too large for unsigned int are clamped to the largest
 * unsigned int instead of being truncated. Zero is returned as-is.
 */
static unsigned int udp_relay_packet_log_sample_every(void) {
    /* Largest unsigned int, computed locally so the block needs no extra header. */
    const unsigned int sample_max = ~0U;
    const char *raw = getenv("OMNI_RELAY_PACKET_LOG_SAMPLE_EVERY");
    unsigned long parsed;
    char *endptr = NULL;

    if (raw == NULL || raw[0] == '\0') {
        return UDP_RELAY_DEFAULT_PACKET_LOG_SAMPLE_EVERY;
    }
    /* strtoul() accepts "-N" and returns the negated value as a huge unsigned number. */
    if (strchr(raw, '-') != NULL) {
        return UDP_RELAY_DEFAULT_PACKET_LOG_SAMPLE_EVERY;
    }
    parsed = strtoul(raw, &endptr, 10);
    if (endptr == raw || *endptr != '\0') {
        return UDP_RELAY_DEFAULT_PACKET_LOG_SAMPLE_EVERY;
    }
    /*
     * Truncation could turn e.g. 4294967296 into 0. On overflow strtoul()
     * returns ULONG_MAX, which also ends up clamped here.
     */
    if (parsed > (unsigned long) sample_max) {
        return sample_max;
    }
    return (unsigned int) parsed;
}
/*
 * Returns 1 when event_name is non-NULL and contains the substring "_drop_",
 * otherwise 0.
 */
static int udp_relay_event_should_always_log(const char *event_name) {
    if (event_name == NULL) {
        return 0;
    }
    return strstr(event_name, "_drop_") != NULL ? 1 : 0;
}
/*
 * Decide whether a packet event should be logged.
 *
 * Returns 0 for a NULL relay. Events accepted by
 * udp_relay_event_should_always_log() are always logged. Otherwise a sample
 * interval of 0 logs nothing, 1 logs everything, and any other value N logs
 * when the incremented packet_log_counter is a multiple of N. The counter is
 * only advanced in that last case.
 */
static int udp_relay_should_log_packet(udp_relay_t *relay, const char *event_name) {
    unsigned int every;
    unsigned long long ordinal;

    if (relay == NULL) {
        return 0;
    }
    if (udp_relay_event_should_always_log(event_name)) {
        return 1;
    }

    every = relay->packet_log_sample_every;
    switch (every) {
    case 0U:
        return 0;
    case 1U:
        return 1;
    default:
        break;
    }

    ordinal = atomic_fetch_add_explicit(&relay->packet_log_counter, 1U, memory_order_relaxed) + 1U;
    return (ordinal % (unsigned long long) every) == 0U;
}
static void udp_relay_parse_kcp_summary(const uint8_t *packet, size_t len, int *has_conv, uint32_t *conv, size_t *segment_count) { static void udp_relay_parse_kcp_summary(const uint8_t *packet, size_t len, int *has_conv, uint32_t *conv, size_t *segment_count) {
size_t offset = 0; size_t offset = 0;
size_t count = 0; size_t count = 0;
@@ -99,6 +143,9 @@ static void udp_relay_print_packet(udp_relay_t *relay, const char *event_name, c
if (relay == NULL) { if (relay == NULL) {
return; return;
} }
if (!udp_relay_should_log_packet(relay, event_name)) {
return;
}
if (remote_addr != NULL && remote_addr_len > 0) { if (remote_addr != NULL && remote_addr_len > 0) {
omni_sockaddr_to_string((const struct sockaddr *) remote_addr, remote_addr_len, remote_addr_text, sizeof(remote_addr_text)); omni_sockaddr_to_string((const struct sockaddr *) remote_addr, remote_addr_len, remote_addr_text, sizeof(remote_addr_text));
@@ -461,6 +508,8 @@ udp_relay_t *udp_relay_open(const char *listen_addr, const char *upstream_addr)
} }
pthread_mutex_init(&relay->lock, NULL); pthread_mutex_init(&relay->lock, NULL);
pthread_mutex_init(&relay->log_mu, NULL); pthread_mutex_init(&relay->log_mu, NULL);
relay->packet_log_sample_every = udp_relay_packet_log_sample_every();
atomic_init(&relay->packet_log_counter, 0U);
pthread_mutex_init(&relay->state_mu, NULL); pthread_mutex_init(&relay->state_mu, NULL);
pthread_cond_init(&relay->state_cond, NULL); pthread_cond_init(&relay->state_cond, NULL);
return relay; return relay;

View File

@@ -72,6 +72,8 @@ struct kcp_conn {
uint64_t pending_in_errs; uint64_t pending_in_errs;
uint64_t pending_kcp_in_errs; uint64_t pending_kcp_in_errs;
protocol_frame_decoder_t decoder; protocol_frame_decoder_t decoder;
int32_t min_srtt_ms;
uint32_t last_feedback_ms;
uint8_t scratch[KCP_RECV_CHUNK_SIZE]; uint8_t scratch[KCP_RECV_CHUNK_SIZE];
latency_logger_t *logger; latency_logger_t *logger;
char node_role[OMNI_MAX_NODE_ROLE]; char node_role[OMNI_MAX_NODE_ROLE];
@@ -307,6 +309,26 @@ static uint64_t kcp_counter_diff(uint64_t previous, uint64_t current) {
return current < previous ? 0 : current - previous; return current < previous ? 0 : current - previous;
} }
/*
 * Fold the KCP rx_srtt value into conn->min_srtt_ms.
 *
 * Non-positive rx_srtt values are ignored. A non-positive min_srtt_ms is
 * treated as "no minimum recorded yet". Does nothing when conn or conn->kcp
 * is NULL. The "_locked" suffix presumably means the caller holds conn->kcp_mu.
 */
static void kcp_conn_update_min_srtt_locked(kcp_conn_t *conn) {
    int32_t sample;
    int have_minimum;

    if (conn == NULL || conn->kcp == NULL) {
        return;
    }
    sample = conn->kcp->rx_srtt;
    if (sample <= 0) {
        return;
    }
    have_minimum = conn->min_srtt_ms > 0;
    if (!have_minimum || sample < conn->min_srtt_ms) {
        conn->min_srtt_ms = sample;
    }
}
/*
 * Stamp conn->last_feedback_ms with omni_now_millis32() and refresh the
 * minimum smoothed RTT. Does nothing for a NULL conn.
 */
static void kcp_conn_note_feedback_locked(kcp_conn_t *conn) {
    if (conn != NULL) {
        conn->last_feedback_ms = omni_now_millis32();
        kcp_conn_update_min_srtt_locked(conn);
    }
}
static int kcp_process_sampler_matches(const kcp_process_sampler_t *sampler, kcp_session_stats_logger_t *logger, const char *node_role, const char *node_id, int stats_interval_ms) { static int kcp_process_sampler_matches(const kcp_process_sampler_t *sampler, kcp_session_stats_logger_t *logger, const char *node_role, const char *node_id, int stats_interval_ms) {
if (sampler == NULL) { if (sampler == NULL) {
return 0; return 0;
@@ -729,20 +751,6 @@ static void kcp_process_sampler_release(kcp_process_sampler_t *sampler) {
free(sampler); free(sampler);
} }
static void kcp_process_sampler_request_sample(kcp_process_sampler_t *sampler, const char *reason) {
if (sampler == NULL) {
return;
}
pthread_mutex_lock(&sampler->lock);
if (!sampler->stopped && !sampler->request_pending) {
sampler->request_pending = 1;
sampler->pending_request_id++;
snprintf(sampler->pending_reason, sizeof(sampler->pending_reason), "%s", reason == NULL ? "" : reason);
pthread_cond_broadcast(&sampler->cond);
}
pthread_mutex_unlock(&sampler->lock);
}
static void kcp_process_sampler_request_sample_and_wait(kcp_process_sampler_t *sampler, const char *reason) { static void kcp_process_sampler_request_sample_and_wait(kcp_process_sampler_t *sampler, const char *reason) {
uint64_t request_id; uint64_t request_id;
@@ -1107,8 +1115,13 @@ static void kcp_log_session_snapshot(kcp_conn_t *conn, const char *reason) {
record.rto_ms = conn->kcp->rx_rto; record.rto_ms = conn->kcp->rx_rto;
record.has_srtt_ms = 1; record.has_srtt_ms = 1;
record.srtt_ms = conn->kcp->rx_srtt; record.srtt_ms = conn->kcp->rx_srtt;
kcp_conn_update_min_srtt_locked(conn);
record.has_min_srtt_ms = conn->min_srtt_ms > 0;
record.min_srtt_ms = conn->min_srtt_ms;
record.has_srttvar_ms = 1; record.has_srttvar_ms = 1;
record.srttvar_ms = conn->kcp->rx_rttval; record.srttvar_ms = conn->kcp->rx_rttval;
record.has_last_feedback_age_ms = conn->last_feedback_ms != 0;
record.last_feedback_age_ms = conn->last_feedback_ms == 0 ? 0 : (omni_now_millis32() - conn->last_feedback_ms);
record.has_snd_wnd = 1; record.has_snd_wnd = 1;
record.snd_wnd = conn->kcp->snd_wnd; record.snd_wnd = conn->kcp->snd_wnd;
record.has_rmt_wnd = 1; record.has_rmt_wnd = 1;
@@ -1282,6 +1295,7 @@ static void *kcp_client_recv_thread_main(void *arg) {
if (ikcp_input(conn->kcp, (const char *) buffer, n) != 0) { if (ikcp_input(conn->kcp, (const char *) buffer, n) != 0) {
kcp_conn_record_error(conn); kcp_conn_record_error(conn);
} else { } else {
kcp_conn_note_feedback_locked(conn);
kcp_conn_record_input(conn, (int) n, segment_count); kcp_conn_record_input(conn, (int) n, segment_count);
} }
pthread_mutex_unlock(&conn->kcp_mu); pthread_mutex_unlock(&conn->kcp_mu);
@@ -1644,6 +1658,7 @@ static void *kcp_listener_recv_thread_main(void *arg) {
if (ikcp_input(conn->kcp, (const char *) buffer, n) != 0) { if (ikcp_input(conn->kcp, (const char *) buffer, n) != 0) {
kcp_conn_record_error(conn); kcp_conn_record_error(conn);
} else { } else {
kcp_conn_note_feedback_locked(conn);
kcp_conn_record_input(conn, (int) n, segment_count); kcp_conn_record_input(conn, (int) n, segment_count);
} }
pthread_mutex_unlock(&conn->kcp_mu); pthread_mutex_unlock(&conn->kcp_mu);
@@ -1771,8 +1786,6 @@ int kcp_conn_send(kcp_conn_t *conn, const message_t *msg) {
return -1; return -1;
} }
latencylog_log_message_event(conn->logger, conn->node_role, conn->node_id, EVENT_SEND_HANDOFF_BEGIN, msg); latencylog_log_message_event(conn->logger, conn->node_role, conn->node_id, EVENT_SEND_HANDOFF_BEGIN, msg);
kcp_log_session_snapshot(conn, "send_handoff_begin");
kcp_process_sampler_request_sample(conn->process_sampler, "send_handoff_begin");
pthread_mutex_lock(&conn->kcp_mu); pthread_mutex_lock(&conn->kcp_mu);
atomic_store(&conn->sock_state->last_send_errno, 0); atomic_store(&conn->sock_state->last_send_errno, 0);
conn->kcp->current = omni_now_millis32(); conn->kcp->current = omni_now_millis32();
@@ -1791,8 +1804,6 @@ int kcp_conn_send(kcp_conn_t *conn, const message_t *msg) {
free(frame); free(frame);
return -1; return -1;
} }
kcp_log_session_snapshot(conn, "send_handoff_end");
kcp_process_sampler_request_sample(conn->process_sampler, "send_handoff_end");
latencylog_log_message_event(conn->logger, conn->node_role, conn->node_id, EVENT_SEND_HANDOFF_END, msg); latencylog_log_message_event(conn->logger, conn->node_role, conn->node_id, EVENT_SEND_HANDOFF_END, msg);
free(frame); free(frame);
return 0; return 0;
@@ -1835,8 +1846,6 @@ int kcp_conn_receive_timed(kcp_conn_t *conn, message_t *out_msg, int timeout_ms)
return -1; return -1;
} }
free(frame); free(frame);
kcp_log_session_snapshot(conn, "receive");
kcp_process_sampler_request_sample(conn->process_sampler, "receive");
return 0; return 0;
} }
pthread_mutex_lock(&conn->kcp_mu); pthread_mutex_lock(&conn->kcp_mu);
@@ -1927,7 +1936,10 @@ void kcp_conn_runtime_stats_snapshot(kcp_conn_t *conn, kcp_runtime_stats_t *out_
out_stats->conv = conn->kcp->conv; out_stats->conv = conn->kcp->conv;
out_stats->rto_ms = conn->kcp->rx_rto; out_stats->rto_ms = conn->kcp->rx_rto;
out_stats->srtt_ms = conn->kcp->rx_srtt; out_stats->srtt_ms = conn->kcp->rx_srtt;
kcp_conn_update_min_srtt_locked(conn);
out_stats->min_srtt_ms = conn->min_srtt_ms;
out_stats->srttvar_ms = conn->kcp->rx_rttval; out_stats->srttvar_ms = conn->kcp->rx_rttval;
out_stats->last_feedback_age_ms = conn->last_feedback_ms == 0 ? 0 : (omni_now_millis32() - conn->last_feedback_ms);
out_stats->snd_wnd = conn->kcp->snd_wnd; out_stats->snd_wnd = conn->kcp->snd_wnd;
out_stats->rmt_wnd = conn->kcp->rmt_wnd; out_stats->rmt_wnd = conn->kcp->rmt_wnd;
out_stats->inflight = conn->kcp->snd_nxt - conn->kcp->snd_una; out_stats->inflight = conn->kcp->snd_nxt - conn->kcp->snd_una;

View File

@@ -18,7 +18,7 @@ tx_timestamp_debug_logger_t *tx_timestamp_debug_open_jsonl(const char *path) {
fclose(file); fclose(file);
return NULL; return NULL;
} }
omni_file_logger_init(&logger->file_logger, file); omni_file_logger_init_path(&logger->file_logger, file, path, 0);
logger->enabled = 1; logger->enabled = 1;
return logger; return logger;
} }

View File

@@ -31,8 +31,10 @@
#define VIDEO_SOFT_BACKPRESSURE_SEGMENTS_DEFAULT 64 #define VIDEO_SOFT_BACKPRESSURE_SEGMENTS_DEFAULT 64
#define VIDEO_HARD_BACKPRESSURE_SEGMENTS_DEFAULT 192 #define VIDEO_HARD_BACKPRESSURE_SEGMENTS_DEFAULT 192
#define VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT 1000 #define VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT 1000
#define VIDEO_DEFAULT_FRAME_STALL_RECONNECT_MS 3000
#define VIDEO_SOFT_BACKPRESSURE_WINDOW_PRESSURE_PCT 90.0 #define VIDEO_SOFT_BACKPRESSURE_WINDOW_PRESSURE_PCT 90.0
#define VIDEO_HARD_BACKPRESSURE_WINDOW_PRESSURE_PCT 98.0 #define VIDEO_HARD_BACKPRESSURE_WINDOW_PRESSURE_PCT 98.0
#define VIDEO_SESSION_POLL_INTERVAL_MS 250
typedef struct video_buffer { typedef struct video_buffer {
void *start; void *start;
@@ -44,6 +46,7 @@ typedef struct video_sender {
char target_peer[OMNI_MAX_PEER_ID]; char target_peer[OMNI_MAX_PEER_ID];
uint8_t *send_buffer; uint8_t *send_buffer;
size_t send_buffer_cap; size_t send_buffer_cap;
uint64_t next_frame_seq;
} video_sender_t; } video_sender_t;
static int video_pipeline_stop_requested(volatile sig_atomic_t *stop_requested) { static int video_pipeline_stop_requested(volatile sig_atomic_t *stop_requested) {
@@ -180,6 +183,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const
video_pipeline_set_error(stats, buffer); video_pipeline_set_error(stats, buffer);
} }
/*
 * Invoke config->progress_callback with config->progress_context, if a
 * config and a callback are both present.
 */
static void video_pipeline_report_progress(const video_pipeline_config_t *config) {
    if (config != NULL && config->progress_callback != NULL) {
        config->progress_callback(config->progress_context);
    }
}
void video_pipeline_config_init(video_pipeline_config_t *config) { void video_pipeline_config_init(video_pipeline_config_t *config) {
if (config == NULL) { if (config == NULL) {
return; return;
@@ -201,6 +211,10 @@ void video_pipeline_config_init(video_pipeline_config_t *config) {
config->soft_backpressure_segments = VIDEO_SOFT_BACKPRESSURE_SEGMENTS_DEFAULT; config->soft_backpressure_segments = VIDEO_SOFT_BACKPRESSURE_SEGMENTS_DEFAULT;
config->hard_backpressure_segments = VIDEO_HARD_BACKPRESSURE_SEGMENTS_DEFAULT; config->hard_backpressure_segments = VIDEO_HARD_BACKPRESSURE_SEGMENTS_DEFAULT;
config->hard_backpressure_hold_ms = VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT; config->hard_backpressure_hold_ms = VIDEO_HARD_BACKPRESSURE_HOLD_MS_DEFAULT;
config->frame_stall_reconnect_ms = VIDEO_DEFAULT_FRAME_STALL_RECONNECT_MS;
config->stats_logger = NULL;
config->stage_logger = NULL;
config->stats_interval_ms = 1000;
} }
void video_pipeline_config_load_env(video_pipeline_config_t *config) { void video_pipeline_config_load_env(video_pipeline_config_t *config) {
@@ -221,6 +235,8 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) {
config->soft_backpressure_segments = env_int_or_default("OMNI_VIDEO_SOFT_BACKPRESSURE_SEGMENTS", config->soft_backpressure_segments); config->soft_backpressure_segments = env_int_or_default("OMNI_VIDEO_SOFT_BACKPRESSURE_SEGMENTS", config->soft_backpressure_segments);
config->hard_backpressure_segments = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_SEGMENTS", config->hard_backpressure_segments); config->hard_backpressure_segments = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_SEGMENTS", config->hard_backpressure_segments);
config->hard_backpressure_hold_ms = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS", config->hard_backpressure_hold_ms); config->hard_backpressure_hold_ms = env_int_or_default("OMNI_VIDEO_HARD_BACKPRESSURE_HOLD_MS", config->hard_backpressure_hold_ms);
config->frame_stall_reconnect_ms = env_int_or_default("OMNI_VIDEO_FRAME_STALL_RECONNECT_MS", config->frame_stall_reconnect_ms);
config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms);
} }
int video_pipeline_stats_init(video_pipeline_stats_t *stats) { int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
@@ -258,6 +274,8 @@ void video_pipeline_stats_snapshot(video_pipeline_stats_t *stats, video_pipeline
out_stats->backlog_resets = stats->backlog_resets; out_stats->backlog_resets = stats->backlog_resets;
out_stats->last_frame_bytes = stats->last_frame_bytes; out_stats->last_frame_bytes = stats->last_frame_bytes;
out_stats->last_backlog_segments = stats->last_backlog_segments; out_stats->last_backlog_segments = stats->last_backlog_segments;
out_stats->last_capture_to_send_ms = stats->last_capture_to_send_ms;
out_stats->avg_capture_to_send_ms = stats->avg_capture_to_send_ms;
out_stats->connected = stats->connected; out_stats->connected = stats->connected;
snprintf(out_stats->last_error, sizeof(out_stats->last_error), "%s", stats->last_error); snprintf(out_stats->last_error, sizeof(out_stats->last_error), "%s", stats->last_error);
snprintf(out_stats->last_backlog_reason, sizeof(out_stats->last_backlog_reason), "%s", stats->last_backlog_reason); snprintf(out_stats->last_backlog_reason, sizeof(out_stats->last_backlog_reason), "%s", stats->last_backlog_reason);
@@ -586,8 +604,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config
&options, &options,
NULL, NULL,
NULL, NULL,
NULL, config->stats_logger,
KCP_DEFAULT_STATS_INTERVAL_MS config->stats_interval_ms
); );
if (sender->client == NULL) { if (sender->client == NULL) {
return -1; return -1;
@@ -596,6 +614,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config
} }
static int video_sender_drain_pending_messages(video_sender_t *sender) { static int video_sender_drain_pending_messages(video_sender_t *sender) {
int drained = 0;
if (sender == NULL || sender->client == NULL) { if (sender == NULL || sender->client == NULL) {
errno = EINVAL; errno = EINVAL;
return -1; return -1;
@@ -618,16 +638,22 @@ static int video_sender_drain_pending_messages(video_sender_t *sender) {
// Drain unread server errors so an offline receiver cannot back up the reverse KCP stream. // Drain unread server errors so an offline receiver cannot back up the reverse KCP stream.
protocol_message_clear(&msg); protocol_message_clear(&msg);
drained += 1;
if (drained >= 8) {
return 0;
}
} }
} }
static int video_sender_send_packet( static int video_sender_send_packet(
video_sender_t *sender, video_sender_t *sender,
const AVPacket *encoded_pkt, const AVPacket *encoded_pkt,
const video_pipeline_packet_metadata_t *metadata const video_pipeline_packet_metadata_t *metadata,
uint64_t *out_frame_seq
) { ) {
uint8_t *payload; uint8_t *payload;
size_t payload_len; size_t payload_len;
uint64_t frame_seq;
int rc; int rc;
if (sender == NULL || sender->client == NULL || encoded_pkt == NULL || metadata == NULL) { if (sender == NULL || sender->client == NULL || encoded_pkt == NULL || metadata == NULL) {
@@ -635,18 +661,31 @@ static int video_sender_send_packet(
return -1; return -1;
} }
payload_len = (size_t) encoded_pkt->size + sizeof(*metadata); frame_seq = sender->next_frame_seq + 1U;
payload_len = 8U + (size_t) encoded_pkt->size + sizeof(*metadata);
if (video_sender_ensure_buffer_capacity(sender, payload_len) != 0) { if (video_sender_ensure_buffer_capacity(sender, payload_len) != 0) {
return -1; return -1;
} }
payload = sender->send_buffer; payload = sender->send_buffer;
memcpy(payload, encoded_pkt->data, (size_t) encoded_pkt->size); payload[0] = (uint8_t) (frame_seq >> 56);
memcpy(payload + encoded_pkt->size, metadata, sizeof(*metadata)); payload[1] = (uint8_t) (frame_seq >> 48);
payload[2] = (uint8_t) (frame_seq >> 40);
payload[3] = (uint8_t) (frame_seq >> 32);
payload[4] = (uint8_t) (frame_seq >> 24);
payload[5] = (uint8_t) (frame_seq >> 16);
payload[6] = (uint8_t) (frame_seq >> 8);
payload[7] = (uint8_t) frame_seq;
memcpy(payload + 8U, encoded_pkt->data, (size_t) encoded_pkt->size);
memcpy(payload + 8U + (size_t) encoded_pkt->size, metadata, sizeof(*metadata));
rc = kcp_client_send_binary(sender->client, sender->target_peer, payload, payload_len); rc = kcp_client_send_binary(sender->client, sender->target_peer, payload, payload_len);
if (rc != 0) { if (rc != 0) {
return rc; return rc;
} }
sender->next_frame_seq = frame_seq;
if (out_frame_seq != NULL) {
*out_frame_seq = frame_seq;
}
rc = video_sender_drain_pending_messages(sender); rc = video_sender_drain_pending_messages(sender);
return rc; return rc;
} }
@@ -715,6 +754,181 @@ static void video_pipeline_note_backpressure(
pthread_mutex_unlock(&stats->mutex); pthread_mutex_unlock(&stats->mutex);
} }
/*
 * Record a capture-to-send latency sample under stats->mutex.
 *
 * Stores the sample as last_capture_to_send_ms and updates
 * avg_capture_to_send_ms: while the average is <= 0 it is seeded with the
 * sample, afterwards it is blended as 0.9 * average + 0.1 * sample.
 * Does nothing for NULL stats.
 */
static void video_pipeline_note_capture_to_send(video_pipeline_stats_t *stats, uint32_t capture_to_send_ms) {
    int needs_seed;

    if (stats == NULL) {
        return;
    }

    pthread_mutex_lock(&stats->mutex);
    stats->last_capture_to_send_ms = capture_to_send_ms;
    needs_seed = stats->avg_capture_to_send_ms <= 0.0;
    if (needs_seed) {
        stats->avg_capture_to_send_ms = (double) capture_to_send_ms;
    }
    if (!needs_seed) {
        stats->avg_capture_to_send_ms = stats->avg_capture_to_send_ms * 0.9 + (double) capture_to_send_ms * 0.1;
    }
    pthread_mutex_unlock(&stats->mutex);
}
/*
 * Sampling predicate for per-frame stage logging.
 *
 * Returns 0 when the logger is NULL or disabled. Otherwise returns 1 when
 * sample_mod is 0 or 1, or when frame_seq is a multiple of sample_mod.
 */
static int video_stage_logger_should_log(const video_stage_logger_t *logger, uint64_t frame_seq) {
    if (logger != NULL && logger->enabled) {
        return logger->sample_mod <= 1U || frame_seq % logger->sample_mod == 0U;
    }
    return 0;
}
/*
 * Emit one JSON line with the per-stage timings of a frame.
 *
 * Nothing is written when video_stage_logger_should_log() rejects the frame
 * or when the line cannot be allocated. The result of the write itself is
 * deliberately ignored.
 */
static void video_stage_logger_log_frame(
    video_stage_logger_t *logger,
    uint64_t frame_seq,
    double capture_ms,
    double decode_ms,
    double scale_ms,
    double encode_ms,
    double send_ms,
    double pipeline_total_ms,
    size_t jpeg_bytes,
    uint64_t kcp_out_seg_delta,
    uint32_t backlog_segments,
    double window_pressure_pct,
    int32_t video_srtt_ms
) {
    char *json_line = NULL;

    if (video_stage_logger_should_log(logger, frame_seq)) {
        json_line = omni_strdup_printf(
            "{\"ts_unix_nano\":%" PRId64 ",\"frame_seq\":%" PRIu64 ",\"capture_ms\":%.3f,\"decode_ms\":%.3f,\"scale_ms\":%.3f,\"encode_ms\":%.3f,\"send_ms\":%.3f,\"pipeline_total_ms\":%.3f,\"jpeg_bytes\":%zu,\"kcp_out_seg_delta\":%" PRIu64 ",\"backlog_segments\":%u,\"window_pressure_pct\":%.3f,\"video_srtt_ms\":%d}",
            omni_now_unix_nano(),
            frame_seq,
            capture_ms,
            decode_ms,
            scale_ms,
            encode_ms,
            send_ms,
            pipeline_total_ms,
            jpeg_bytes,
            kcp_out_seg_delta,
            backlog_segments,
            window_pressure_pct,
            video_srtt_ms
        );
    }
    if (json_line != NULL) {
        (void) omni_file_logger_write_line(&logger->file_logger, json_line);
        free(json_line);
    }
}
video_stage_logger_t *video_stage_logger_open_jsonl(const char *path, uint64_t sample_mod) {
video_stage_logger_t *logger;
FILE *file;
if (path == NULL || path[0] == '\0') {
return NULL;
}
if (omni_ensure_parent_dir(path) != 0) {
return NULL;
}
file = fopen(path, "ab");
if (file == NULL) {
return NULL;
}
logger = (video_stage_logger_t *) calloc(1, sizeof(*logger));
if (logger == NULL) {
fclose(file);
return NULL;
}
omni_file_logger_init_path(&logger->file_logger, file, path, 0);
logger->enabled = 1;
logger->sample_mod = sample_mod == 0U ? 1U : sample_mod;
return logger;
}
/*
 * Close the logger's stream if one is open, destroy the embedded file
 * logger, and free the logger. A NULL logger is ignored.
 */
void video_stage_logger_close(video_stage_logger_t *logger) {
    FILE *stream;

    if (logger == NULL) {
        return;
    }
    stream = logger->file_logger.file;
    if (stream != NULL) {
        fclose(stream);
    }
    omni_file_logger_destroy(&logger->file_logger);
    free(logger);
}
/*
 * Returns 1 when `message` contains any of the marker phrases listed below,
 * 0 otherwise (including for a NULL or empty message).
 */
static int video_server_error_requires_reconnect(const char *message) {
    static const char *const reconnect_markers[] = {
        "not registered",
        "first message must be register",
        "peer replaced",
        "timed out waiting for server_register_ok",
        "failed to acknowledge server heartbeat",
    };
    size_t marker_index;

    if (message == NULL || message[0] == '\0') {
        return 0;
    }
    for (marker_index = 0; marker_index < sizeof(reconnect_markers) / sizeof(reconnect_markers[0]); ++marker_index) {
        if (strstr(message, reconnect_markers[marker_index]) != NULL) {
            return 1;
        }
    }
    return 0;
}
/*
 * Copy transport and session state into `stats` under stats->mutex.
 *
 * A non-NULL transport replaces stats->transport. A non-NULL client_state
 * sets stats->connected to "connected and registered" and, when it carries a
 * non-empty last_server_error, copies that text into stats->last_error.
 * Does nothing for NULL stats.
 */
static void video_pipeline_update_connection_state(
    video_pipeline_stats_t *stats,
    const kcp_client_state_t *client_state,
    const kcp_runtime_stats_t *transport
) {
    int has_server_error;

    if (stats == NULL) {
        return;
    }

    pthread_mutex_lock(&stats->mutex);
    if (transport != NULL) {
        stats->transport = *transport;
    }
    if (client_state != NULL) {
        stats->connected = client_state->connected != 0 && client_state->registered != 0;
        has_server_error = client_state->last_server_error[0] != '\0';
        if (has_server_error) {
            snprintf(stats->last_error, sizeof(stats->last_error), "%s", client_state->last_server_error);
        }
    }
    pthread_mutex_unlock(&stats->mutex);
}
/*
 * Snapshot the sender's transport and session state and decide whether the
 * session must be re-established.
 *
 * Returns -1 with errno = EINVAL on a NULL/empty argument. Otherwise fills
 * *transport_stats, mirrors the state into `stats`, and returns 1 with a
 * description in `reason` when the transport or client is disconnected, the
 * client is not registered, or the last server error is one accepted by
 * video_server_error_requires_reconnect(). Returns 0 with an empty `reason`
 * when none of those hold.
 */
static int video_sender_check_session_stale(
    video_sender_t *sender,
    const video_pipeline_config_t *config,
    video_pipeline_stats_t *stats,
    kcp_runtime_stats_t *transport_stats,
    char *reason,
    size_t reason_len
) {
    kcp_client_state_t session;
    int bad_args;

    bad_args = sender == NULL || sender->client == NULL || config == NULL || stats == NULL
        || transport_stats == NULL || reason == NULL || reason_len == 0;
    if (bad_args) {
        errno = EINVAL;
        return -1;
    }

    reason[0] = '\0';
    memset(&session, 0, sizeof(session));
    kcp_client_runtime_stats_snapshot(sender->client, transport_stats);
    kcp_client_state_snapshot(sender->client, &session);
    video_pipeline_update_connection_state(stats, &session, transport_stats);

    if (!transport_stats->connected || !session.connected) {
        snprintf(reason, reason_len, "video session stale: transport disconnected");
        return 1;
    }
    if (!session.registered) {
        snprintf(reason, reason_len, "video session stale: server reported unregistered");
        return 1;
    }
    if (video_server_error_requires_reconnect(session.last_server_error)) {
        snprintf(reason, reason_len, "video session stale: server error %.180s", session.last_server_error);
        return 1;
    }
    return 0;
}
static void video_pipeline_cleanup_buffers(video_buffer_t *buffers, int num_buffers) { static void video_pipeline_cleanup_buffers(video_buffer_t *buffers, int num_buffers) {
int i; int i;
if (buffers == NULL) { if (buffers == NULL) {
@@ -745,6 +959,12 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
int sws_src_format = -1; int sws_src_format = -1;
uint32_t hard_backpressure_since_ms = 0; uint32_t hard_backpressure_since_ms = 0;
uint32_t last_soft_drop_log_ms = 0; uint32_t last_soft_drop_log_ms = 0;
uint32_t last_session_poll_ms = 0;
uint32_t last_successful_send_ms = 0;
uint64_t soft_drops_since_last_send = 0;
int have_sent_frame = 0;
const char *gpsd_host = env_or_default("OMNI_GPSD_HOST", "127.0.0.1");
int gps_buffer_started = 0;
memset(&sender, 0, sizeof(sender)); memset(&sender, 0, sizeof(sender));
if (stats == NULL) { if (stats == NULL) {
@@ -792,6 +1012,11 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
video_pipeline_set_errno_error(stats, "failed to start video sender"); video_pipeline_set_errno_error(stats, "failed to start video sender");
goto cleanup; goto cleanup;
} }
if (gps_buffer_init(gpsd_host) != 0) {
fprintf(stderr, "[video_pipeline] failed to start GPS buffer using %s:2947\n", gpsd_host);
} else {
gps_buffer_started = 1;
}
pthread_mutex_lock(&stats->mutex); pthread_mutex_lock(&stats->mutex);
stats->connected = 1; stats->connected = 1;
@@ -829,7 +1054,9 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
AVFrame *scaled_frame = NULL; AVFrame *scaled_frame = NULL;
AVPacket *encoded_pkt = NULL; AVPacket *encoded_pkt = NULL;
kcp_runtime_stats_t transport_stats; kcp_runtime_stats_t transport_stats;
kcp_runtime_stats_t transport_after_send;
int select_rc; int select_rc;
int should_log_stage = 0;
double total_start_ms = 0.0; double total_start_ms = 0.0;
double capture_start_ms = 0.0; double capture_start_ms = 0.0;
double capture_end_ms = 0.0; double capture_end_ms = 0.0;
@@ -842,17 +1069,23 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
double send_start_ms = 0.0; double send_start_ms = 0.0;
double send_end_ms = 0.0; double send_end_ms = 0.0;
video_pipeline_packet_metadata_t packet_metadata; video_pipeline_packet_metadata_t packet_metadata;
char reconnect_reason[256];
int frame_number = frame_index + 1; int frame_number = frame_index + 1;
uint64_t frame_seq = 0;
uint64_t out_segs_before_send = 0;
uint64_t out_segs_after_send = 0;
uint32_t capture_to_send_ms = 0;
memset(&transport_stats, 0, sizeof(transport_stats)); memset(&transport_stats, 0, sizeof(transport_stats));
memset(&transport_after_send, 0, sizeof(transport_after_send));
memset(&packet_metadata, 0, sizeof(packet_metadata)); memset(&packet_metadata, 0, sizeof(packet_metadata));
reconnect_reason[0] = '\0';
video_pipeline_report_progress(config);
if (config->max_frames > 0 && frame_index >= config->max_frames) { if (config->max_frames > 0 && frame_index >= config->max_frames) {
break; break;
} }
if (config->enable_timing_logs) {
total_start_ms = video_pipeline_now_ms(); total_start_ms = video_pipeline_now_ms();
}
FD_ZERO(&fds); FD_ZERO(&fds);
FD_SET(fd, &fds); FD_SET(fd, &fds);
@@ -866,9 +1099,7 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
video_pipeline_set_errno_error(stats, "failed waiting for camera frame"); video_pipeline_set_errno_error(stats, "failed waiting for camera frame");
goto cleanup; goto cleanup;
} }
if (config->enable_timing_logs) {
capture_start_ms = video_pipeline_now_ms(); capture_start_ms = video_pipeline_now_ms();
}
memset(&buf, 0, sizeof(buf)); memset(&buf, 0, sizeof(buf));
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
@@ -877,10 +1108,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
video_pipeline_set_errno_error(stats, "failed to dequeue V4L2 buffer"); video_pipeline_set_errno_error(stats, "failed to dequeue V4L2 buffer");
goto cleanup; goto cleanup;
} }
if (config->enable_timing_logs) {
capture_end_ms = video_pipeline_now_ms(); capture_end_ms = video_pipeline_now_ms();
decode_start_ms = capture_end_ms; decode_start_ms = capture_end_ms;
}
if (decode_mjpeg_frame(decoder, (const uint8_t *) buffers[buf.index].start, (int) buf.bytesused, &decoded_frame) != 0) { if (decode_mjpeg_frame(decoder, (const uint8_t *) buffers[buf.index].start, (int) buf.bytesused, &decoded_frame) != 0) {
if (config->enable_timing_logs) { if (config->enable_timing_logs) {
@@ -889,10 +1118,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
(void) ioctl(fd, VIDIOC_QBUF, &buf); (void) ioctl(fd, VIDIOC_QBUF, &buf);
continue; continue;
} }
if (config->enable_timing_logs) {
decode_end_ms = video_pipeline_now_ms(); decode_end_ms = video_pipeline_now_ms();
scale_start_ms = decode_end_ms; scale_start_ms = decode_end_ms;
}
if ( if (
ensure_scale_context( ensure_scale_context(
&sws_ctx, &sws_ctx,
@@ -912,10 +1139,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
(void) ioctl(fd, VIDIOC_QBUF, &buf); (void) ioctl(fd, VIDIOC_QBUF, &buf);
continue; continue;
} }
if (config->enable_timing_logs) {
scale_end_ms = video_pipeline_now_ms(); scale_end_ms = video_pipeline_now_ms();
encode_start_ms = scale_end_ms; encode_start_ms = scale_end_ms;
}
if (encode_frame(encoder, scaled_frame, &encoded_pkt) != 0) { if (encode_frame(encoder, scaled_frame, &encoded_pkt) != 0) {
if (config->enable_timing_logs) { if (config->enable_timing_logs) {
video_pipeline_print_timing_failure(frame_number, "encode"); video_pipeline_print_timing_failure(frame_number, "encode");
@@ -925,10 +1150,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
(void) ioctl(fd, VIDIOC_QBUF, &buf); (void) ioctl(fd, VIDIOC_QBUF, &buf);
continue; continue;
} }
if (config->enable_timing_logs) {
encode_end_ms = video_pipeline_now_ms(); encode_end_ms = video_pipeline_now_ms();
send_start_ms = encode_end_ms; send_start_ms = encode_end_ms;
}
{ {
gps_video_sample_t gps_sample = get_latest_gps_for_video(); gps_video_sample_t gps_sample = get_latest_gps_for_video();
@@ -938,7 +1161,45 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
packet_metadata.longitude = gps_sample.longitude; packet_metadata.longitude = gps_sample.longitude;
} }
if (
last_session_poll_ms == 0
|| omni_now_millis32() - last_session_poll_ms >= VIDEO_SESSION_POLL_INTERVAL_MS
) {
if (video_sender_drain_pending_messages(&sender) != 0) {
video_pipeline_set_errno_error(stats, "failed to poll video session");
av_frame_free(&decoded_frame);
av_frame_free(&scaled_frame);
av_packet_free(&encoded_pkt);
(void) ioctl(fd, VIDIOC_QBUF, &buf);
rc = VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE;
goto cleanup;
}
if (
video_sender_check_session_stale(
&sender,
config,
stats,
&transport_stats,
reconnect_reason,
sizeof(reconnect_reason)
) != 0
) {
if (reconnect_reason[0] == '\0') {
snprintf(reconnect_reason, sizeof(reconnect_reason), "video session stale: poll failed");
}
video_pipeline_set_error(stats, reconnect_reason);
fprintf(stderr, "[video_pipeline] %s\n", reconnect_reason);
av_frame_free(&decoded_frame);
av_frame_free(&scaled_frame);
av_packet_free(&encoded_pkt);
(void) ioctl(fd, VIDIOC_QBUF, &buf);
rc = VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE;
goto cleanup;
}
last_session_poll_ms = omni_now_millis32();
} else {
kcp_client_runtime_stats_snapshot(sender.client, &transport_stats); kcp_client_runtime_stats_snapshot(sender.client, &transport_stats);
}
if (video_sender_hard_backpressure_active(config, &transport_stats)) { if (video_sender_hard_backpressure_active(config, &transport_stats)) {
uint32_t now_ms = omni_now_millis32(); uint32_t now_ms = omni_now_millis32();
@@ -997,6 +1258,7 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
config->soft_backpressure_segments config->soft_backpressure_segments
); );
video_pipeline_note_backpressure(stats, reason, &transport_stats, 1, 0); video_pipeline_note_backpressure(stats, reason, &transport_stats, 1, 0);
soft_drops_since_last_send += 1;
if (now_ms - last_soft_drop_log_ms >= 1000U) { if (now_ms - last_soft_drop_log_ms >= 1000U) {
fprintf( fprintf(
stderr, stderr,
@@ -1009,6 +1271,31 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
); );
last_soft_drop_log_ms = now_ms; last_soft_drop_log_ms = now_ms;
} }
if (
have_sent_frame
&& config->frame_stall_reconnect_ms > 0
&& now_ms - last_successful_send_ms >= (uint32_t) config->frame_stall_reconnect_ms
) {
char stall_reason[192];
snprintf(
stall_reason,
sizeof(stall_reason),
"video pipeline stalled: no frames sent for %u ms while soft dropping (%llu drops, backlog=%u, srtt=%d ms)",
now_ms - last_successful_send_ms,
(unsigned long long) soft_drops_since_last_send,
backlog_segments,
transport_stats.srtt_ms
);
video_pipeline_set_error(stats, stall_reason);
fprintf(stderr, "[video_pipeline] %s\n", stall_reason);
av_frame_free(&decoded_frame);
av_frame_free(&scaled_frame);
av_packet_free(&encoded_pkt);
(void) ioctl(fd, VIDIOC_QBUF, &buf);
rc = VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE;
goto cleanup;
}
av_frame_free(&decoded_frame); av_frame_free(&decoded_frame);
av_frame_free(&scaled_frame); av_frame_free(&scaled_frame);
av_packet_free(&encoded_pkt); av_packet_free(&encoded_pkt);
@@ -1016,7 +1303,13 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
continue; continue;
} }
if (video_sender_send_packet(&sender, encoded_pkt, &packet_metadata) != 0) { capture_to_send_ms = send_start_ms <= capture_start_ms
? 0U
: (uint32_t) (send_start_ms - capture_start_ms + 0.5);
packet_metadata.capture_to_send_ms = capture_to_send_ms;
out_segs_before_send = transport_stats.out_segs_total;
if (video_sender_send_packet(&sender, encoded_pkt, &packet_metadata, &frame_seq) != 0) {
pthread_mutex_lock(&stats->mutex); pthread_mutex_lock(&stats->mutex);
stats->send_errors += 1; stats->send_errors += 1;
pthread_mutex_unlock(&stats->mutex); pthread_mutex_unlock(&stats->mutex);
@@ -1030,16 +1323,43 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
(void) ioctl(fd, VIDIOC_QBUF, &buf); (void) ioctl(fd, VIDIOC_QBUF, &buf);
goto cleanup; goto cleanup;
} }
if (config->enable_timing_logs) {
send_end_ms = video_pipeline_now_ms(); send_end_ms = video_pipeline_now_ms();
should_log_stage = video_stage_logger_should_log(config->stage_logger, frame_seq);
if (should_log_stage) {
kcp_client_runtime_stats_snapshot(sender.client, &transport_after_send);
out_segs_after_send = transport_after_send.out_segs_total;
} else {
transport_after_send = transport_stats;
out_segs_after_send = out_segs_before_send;
} }
video_pipeline_note_capture_to_send(stats, capture_to_send_ms);
pthread_mutex_lock(&stats->mutex); pthread_mutex_lock(&stats->mutex);
stats->frames_sent += 1; stats->frames_sent += 1;
stats->bytes_sent += (uint64_t) encoded_pkt->size; stats->bytes_sent += (uint64_t) encoded_pkt->size;
stats->last_frame_bytes = (uint64_t) encoded_pkt->size; stats->last_frame_bytes = (uint64_t) encoded_pkt->size;
kcp_client_runtime_stats_snapshot(sender.client, &stats->transport); stats->transport = transport_after_send;
pthread_mutex_unlock(&stats->mutex); pthread_mutex_unlock(&stats->mutex);
have_sent_frame = 1;
last_successful_send_ms = omni_now_millis32();
soft_drops_since_last_send = 0;
if (should_log_stage) {
video_stage_logger_log_frame(
config->stage_logger,
frame_seq,
capture_end_ms - capture_start_ms,
decode_end_ms - decode_start_ms,
scale_end_ms - scale_start_ms,
encode_end_ms - encode_start_ms,
send_end_ms - send_start_ms,
send_end_ms - total_start_ms,
(size_t) encoded_pkt->size,
out_segs_after_send >= out_segs_before_send ? out_segs_after_send - out_segs_before_send : 0U,
video_sender_backlog_segments(&transport_after_send),
transport_after_send.window_pressure_pct,
transport_after_send.srtt_ms
);
}
if (config->enable_timing_logs) { if (config->enable_timing_logs) {
video_pipeline_print_timing_row( video_pipeline_print_timing_row(
frame_number, frame_number,
@@ -1070,6 +1390,9 @@ cleanup:
pthread_mutex_lock(&stats->mutex); pthread_mutex_lock(&stats->mutex);
stats->connected = 0; stats->connected = 0;
pthread_mutex_unlock(&stats->mutex); pthread_mutex_unlock(&stats->mutex);
if (gps_buffer_started) {
gps_buffer_cleanup();
}
if (fd >= 0) { if (fd >= 0) {
(void) ioctl(fd, VIDIOC_STREAMOFF, &type); (void) ioctl(fd, VIDIOC_STREAMOFF, &type);
} }

View File

@@ -161,6 +161,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const
video_pipeline_set_error(stats, buffer); video_pipeline_set_error(stats, buffer);
} }
/*
 * Invoke the optional progress callback stored in `config`, passing it the
 * configured `progress_context`. Does nothing when `config` is NULL or no
 * callback has been set.
 */
static void video_pipeline_report_progress(const video_pipeline_config_t *config) {
    if (config != NULL && config->progress_callback != NULL) {
        config->progress_callback(config->progress_context);
    }
}
void video_pipeline_config_init(video_pipeline_config_t *config) { void video_pipeline_config_init(video_pipeline_config_t *config) {
if (config == NULL) { if (config == NULL) {
return; return;
@@ -179,6 +186,8 @@ void video_pipeline_config_init(video_pipeline_config_t *config) {
config->output_height = VIDEO_OUTPUT_HEIGHT_DEFAULT; config->output_height = VIDEO_OUTPUT_HEIGHT_DEFAULT;
config->max_frames = 0; config->max_frames = 0;
config->enable_timing_logs = 0; config->enable_timing_logs = 0;
config->stats_logger = NULL;
config->stats_interval_ms = 1000;
} }
void video_pipeline_config_load_env(video_pipeline_config_t *config) { void video_pipeline_config_load_env(video_pipeline_config_t *config) {
@@ -196,6 +205,7 @@ void video_pipeline_config_load_env(video_pipeline_config_t *config) {
config->max_frames = atoi(getenv("OMNI_VIDEO_MAX_FRAMES")); config->max_frames = atoi(getenv("OMNI_VIDEO_MAX_FRAMES"));
} }
config->enable_timing_logs = env_flag_or_default("OMNI_VIDEO_DEBUG_TIMING", config->enable_timing_logs); config->enable_timing_logs = env_flag_or_default("OMNI_VIDEO_DEBUG_TIMING", config->enable_timing_logs);
config->stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", config->stats_interval_ms);
} }
int video_pipeline_stats_init(video_pipeline_stats_t *stats) { int video_pipeline_stats_init(video_pipeline_stats_t *stats) {
@@ -557,8 +567,8 @@ static int video_sender_init(video_sender_t *sender, const video_pipeline_config
&options, &options,
NULL, NULL,
NULL, NULL,
NULL, config->stats_logger,
KCP_DEFAULT_STATS_INTERVAL_MS config->stats_interval_ms
); );
if (sender->client == NULL) { if (sender->client == NULL) {
return -1; return -1;
@@ -757,6 +767,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
double send_end_ms = 0.0; double send_end_ms = 0.0;
int frame_number = frame_index + 1; int frame_number = frame_index + 1;
video_pipeline_report_progress(config);
if (config->max_frames > 0 && frame_index >= config->max_frames) { if (config->max_frames > 0 && frame_index >= config->max_frames) {
break; break;
} }