/*
 * File: OmniSocketGo/cmd/b_side_omnid.c
 * Snapshot: 2026-04-18 12:52:39 +08:00 (1284 lines, 48 KiB, C)
 */
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/un.h>
#include <time.h>
#include <unistd.h>

#include "cJSON.h"
#include "control_protocol.h"
#include "latencylog.h"
#include "protocol.h"
#include "video_pipeline.h"
/*
 * Compile-time defaults for the B-side daemon.
 * NOTE(review): most of these are consumed where the daemon state is populated,
 * which is outside the portion of the file reviewed here — confirm usage there.
 */
/* Default peer ids for the control session (this side / expected sender). */
#define CONTROL_DEFAULT_PEER_ID "peer-b-ctrl"
#define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl"
/* Default peer ids for the control ACK session (this side / ACK recipient). */
#define CONTROL_ACK_DEFAULT_PEER_ID "peer-b-ctrl-ack"
#define CONTROL_ACK_DEFAULT_TARGET_PEER "peer-a-ctrl-ack"
/* Default unix datagram socket path that control packets are forwarded to. */
#define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock"
/* Default server-idle threshold (ms) for reconnecting the control session. */
#define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000
/* Default runtime directory and the status/fault-flag file names placed in it. */
#define DEFAULT_RUNTIME_DIR "/run/blitz-robot"
#define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json"
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
/* Default watchdog timeout (seconds) for the per-thread heartbeats. */
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
/* Default interval (ms) handed to the KCP stats logger. */
#define DEFAULT_KCP_STATS_INTERVAL_MS 1000
/* Default sampling moduli: every Nth message id is latency-logged / ACKed. */
#define DEFAULT_CONTROL_LATENCY_SAMPLE_MOD 100
#define DEFAULT_CONTROL_ACK_SAMPLE_MOD 10
/* Process exit codes used by exit_if_thread_stalled() when a heartbeat expires. */
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
/* Client end of an AF_UNIX datagram socket used to forward control packets. */
typedef struct unix_dgram_client {
/* Socket descriptor; set to -1 after close or a failed bind. */
int fd;
/* Path this client binds to; unlinked again when the client is closed. */
/* NOTE(review): 108 presumably mirrors sizeof(sun_path) on Linux — confirm. */
char bind_path[108];
/* Copy of the destination path, kept so the socket can be reopened later. */
char dest_path[108];
/* Destination address and its length as passed to sendto(). */
struct sockaddr_un dest_addr;
socklen_t dest_len;
} unix_dgram_client_t;
/* Counters and status for the control bridge; fields are accessed under `mutex`. */
typedef struct control_bridge_stats {
pthread_mutex_t mutex;
/* Control packets successfully forwarded to the unix socket. */
uint64_t packets_forwarded;
/* Messages dropped for wrong sender, wrong type, or wrong body size. */
uint64_t invalid_packets;
/* Forwarding attempts that failed (after the optional reopen/retry). */
uint64_t unix_send_errors;
/* Number of successful control-session dials after the first one. */
uint64_t reconnect_count;
/* Last server idle time (ms) copied from the KCP client state snapshot. */
uint32_t server_idle_ms;
/* Non-zero once the control session has connected at least once. */
int ever_connected;
/* Last `registered` flag copied from the KCP client state snapshot. */
int registered;
/* Most recent error text and most recent reason for a forced reconnect. */
char last_error[256];
char last_reconnect_reason[256];
/* Last transport runtime stats snapshot taken from the KCP client. */
kcp_runtime_stats_t transport;
} control_bridge_stats_t;
/* Whole-process state shared between the main thread and the worker threads. */
typedef struct daemon_state {
/* Points at the signal-set stop flag; every worker loop polls it. */
volatile sig_atomic_t *stop_requested;
/* Configuration and live stats handed to video_pipeline_run(). */
video_pipeline_config_t video_config;
video_pipeline_stats_t video_stats;
/* Dial parameters for the control (and control ACK) KCP sessions. */
const char *control_server_addr;
const char *control_relay_via;
const char *control_bind_ip;
const char *control_bind_device;
/* Peer id used when dialing the control session. */
const char *control_peer_id;
/* If non-empty, messages whose `from` differs are counted as invalid. */
const char *control_expected_sender;
/* Peer id for the ACK session and the peer that ACK payloads are sent to. */
const char *control_ack_peer_id;
const char *control_ack_target_peer;
/* NOTE(review): presumably the destination path for unix_client; it is not
 * read in the portion of the file reviewed here — confirm. */
const char *control_unix_socket;
/* Reconnect when the server idle time reaches this many ms (disabled if <= 0). */
int control_server_idle_reconnect_ms;
/* Directory ensured to exist before the status file is written. */
const char *runtime_dir;
/* Watchdog timeout for thread heartbeats, in seconds. */
int heartbeat_timeout_sec;
/* Interval passed to the KCP dial call together with stats_logger. */
int stats_interval_ms;
/* Sampling moduli applied to message ids (<= 1 means every message). */
uint64_t control_latency_sample_mod;
uint64_t control_ack_sample_mod;
/* Paths of the status JSON file and the per-thread fault-injection flags. */
char status_file_path[512];
char video_thread_fault_file[512];
char control_thread_fault_file[512];
/* Last heartbeat of each worker thread, in epoch seconds. */
atomic_long video_thread_heartbeat_epoch_sec;
atomic_long control_thread_heartbeat_epoch_sec;
/* Set to 1 to make the ACK connector thread exit. */
atomic_int control_ack_shutdown_requested;
/* Optional loggers passed through to the transport / latency helpers. */
kcp_session_stats_logger_t *stats_logger;
latency_logger_t *control_latency_logger;
video_stage_logger_t *video_stage_logger;
/* Unix datagram client that control packets are forwarded through. */
unix_dgram_client_t unix_client;
control_bridge_stats_t control_stats;
/* Guards control_ack_client, control_ack_connect_requested and
 * control_ack_connect_inflight. */
pthread_mutex_t control_ack_mutex;
pthread_t control_ack_thread;
/* Currently connected ACK client, or NULL when no ACK session is up. */
kcp_client_t *control_ack_client;
/* Non-zero once control_ack_thread has been started (joined on destroy). */
int control_ack_thread_started;
/* Request flag for the connector thread, and "a dial is in progress" marker. */
int control_ack_connect_requested;
int control_ack_connect_inflight;
} daemon_state_t;
/* Shutdown flag written by the signal handler and polled by the worker loops. */
static volatile sig_atomic_t g_stop_requested = 0;

/* Signal handler: records that shutdown was requested and does nothing else. */
static void handle_signal(int signum) {
    g_stop_requested = 1;
    (void) signum; /* the signal number is irrelevant */
}
static int install_signal_handler(int signum) {
struct sigaction action;
memset(&action, 0, sizeof(action));
action.sa_handler = handle_signal;
action.sa_flags = SA_RESTART;
if (sigemptyset(&action.sa_mask) != 0) {
return -1;
}
return sigaction(signum, &action, NULL);
}
/* Returns the value of environment variable `name`, or `fallback` when it is unset or empty. */
static const char *env_or_default(const char *name, const char *fallback) {
    const char *from_env = getenv(name);
    int usable = from_env != NULL && from_env[0] != '\0';

    return usable ? from_env : fallback;
}
/*
 * Returns the first non-empty value among environment variables `first` and
 * `second` (checked in that order), or `fallback` when neither is usable.
 */
static const char *env_first_nonempty(const char *first, const char *second, const char *fallback) {
    const char *names[2] = { first, second };

    for (size_t i = 0; i < 2; i++) {
        const char *candidate = getenv(names[i]);
        if (candidate != NULL && candidate[0] != '\0') {
            return candidate;
        }
    }
    return fallback;
}
/*
 * Reads environment variable `name` as a strictly positive decimal int.
 *
 * Returns `fallback` when the variable is unset or empty, is not a complete
 * decimal number (e.g. "12abc"), is <= 0, or does not fit in an int.
 * Uses strtol instead of atoi so that overflow and trailing garbage are
 * detected, matching the strictness of env_u64_or_default().
 * errno is left unchanged.
 */
static int env_int_or_default(const char *name, int fallback) {
    const char *value = getenv(name);
    char *endptr = NULL;
    long parsed;
    int saved_errno;
    int out_of_range;

    if (value == NULL || value[0] == '\0') {
        return fallback;
    }

    saved_errno = errno;
    errno = 0;
    parsed = strtol(value, &endptr, 10);
    out_of_range = errno == ERANGE;
    errno = saved_errno; /* parsing a setting must not disturb the caller's errno */

    if (endptr == value || *endptr != '\0' || out_of_range) {
        return fallback;
    }
    if (parsed <= 0 || parsed > INT_MAX) {
        return fallback;
    }
    return (int) parsed;
}
/*
 * Reads environment variable `name` as a strictly positive decimal uint64.
 *
 * Returns `fallback` when the variable is unset or empty, is not a complete
 * decimal number, is zero, is negative, or overflows.
 * strtoull would otherwise accept "-5" and return a huge wrapped value, and
 * would saturate silently on overflow, so both cases are rejected here.
 * errno is left unchanged.
 */
static uint64_t env_u64_or_default(const char *name, uint64_t fallback) {
    const char *value = getenv(name);
    char *endptr = NULL;
    unsigned long long parsed;
    int saved_errno;
    int out_of_range;

    if (value == NULL || value[0] == '\0') {
        return fallback;
    }
    /* A '-' can only be a leading sign (negative) or garbage; neither is valid. */
    if (strchr(value, '-') != NULL) {
        return fallback;
    }

    saved_errno = errno;
    errno = 0;
    parsed = strtoull(value, &endptr, 10);
    out_of_range = errno == ERANGE;
    errno = saved_errno;

    if (endptr == value || *endptr != '\0' || out_of_range || parsed == 0ULL) {
        return fallback;
    }
    return (uint64_t) parsed;
}
/* Current CLOCK_REALTIME time expressed in milliseconds since the epoch. */
static int64_t realtime_epoch_ms(void) {
    struct timespec now;
    int64_t whole_ms;
    int64_t frac_ms;

    clock_gettime(CLOCK_REALTIME, &now);
    whole_ms = (int64_t) now.tv_sec * 1000;
    frac_ms = now.tv_nsec / 1000000;
    return whole_ms + frac_ms;
}
/* Current wall-clock time in whole seconds since the epoch, as a long. */
static long realtime_epoch_sec(void) {
    time_t now = time(NULL);

    return (long) now;
}
/* Stores the current epoch second into `heartbeat`; a NULL pointer is ignored. */
static void update_thread_heartbeat(atomic_long *heartbeat) {
    if (heartbeat != NULL) {
        atomic_store(heartbeat, realtime_epoch_sec());
    }
}
/*
 * Decides whether latency events should be logged for `msg`.
 * Returns 0 when there is no state, no latency logger, or no message.
 * Otherwise returns 1 for every message when the sampling modulus is <= 1,
 * or only for message ids divisible by the modulus.
 */
static int should_log_control_latency(const daemon_state_t *state, const message_t *msg) {
    if (state == NULL || state->control_latency_logger == NULL || msg == NULL) {
        return 0;
    }

    const uint64_t modulus = state->control_latency_sample_mod;
    return modulus <= 1U || msg->id % modulus == 0U;
}
/*
 * Decides whether `msg` is sampled for a control ACK.
 * Returns 0 without state or message. Otherwise returns 1 for every message
 * when the ACK modulus is <= 1, or only for ids divisible by the modulus.
 */
static int should_send_control_ack(const daemon_state_t *state, const message_t *msg) {
    if (state == NULL || msg == NULL) {
        return 0;
    }

    const uint64_t modulus = state->control_ack_sample_mod;
    if (modulus > 1U) {
        return msg->id % modulus == 0U;
    }
    return 1;
}
/* Progress callback: treats `context` as an atomic_long heartbeat and refreshes it. */
static void video_pipeline_heartbeat_progress(void *context) {
    atomic_long *heartbeat = (atomic_long *) context;

    update_thread_heartbeat(heartbeat);
}
/*
 * Makes sure `runtime_dir` exists as a directory, creating it with mode 0775
 * when it is missing (single level only).
 *
 * Returns 0 on success. Returns -1 with errno set on failure: EINVAL for a
 * NULL or empty path, ENOTDIR when the path exists but is not a directory,
 * otherwise the errno from stat()/mkdir().
 */
static int ensure_runtime_dir(const char *runtime_dir) {
    struct stat info;

    if (runtime_dir == NULL || runtime_dir[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    if (stat(runtime_dir, &info) != 0) {
        if (errno != ENOENT) {
            return -1;
        }
        /* Missing: create it; losing a creation race (EEXIST) still counts as success. */
        if (mkdir(runtime_dir, 0775) == 0 || errno == EEXIST) {
            return 0;
        }
        return -1;
    }

    if (!S_ISDIR(info.st_mode)) {
        errno = ENOTDIR;
        return -1;
    }
    return 0;
}
/* Returns 1 when `path` is a non-empty string and access(path, F_OK) succeeds, else 0. */
static int path_exists(const char *path) {
    if (path == NULL || path[0] == '\0') {
        return 0;
    }
    return access(path, F_OK) == 0;
}
/*
 * One-shot fault flag: when the file at `path` exists, tries to unlink it and
 * returns 1; otherwise returns 0. The unlink result is not checked.
 */
static int consume_fault_flag(const char *path) {
    if (path_exists(path)) {
        unlink(path);
        return 1;
    }
    return 0;
}
/*
 * Fault injection hook: when the flag file `fault_path` is present, consumes
 * it, logs to stderr, and sleeps for heartbeat_timeout_sec + 2 seconds so the
 * calling thread misses its heartbeat deadline. Does nothing on NULL arguments
 * or when the flag is absent.
 */
static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) {
    unsigned int stall_seconds;

    if (state == NULL || fault_path == NULL || thread_name == NULL) {
        return;
    }
    if (consume_fault_flag(fault_path)) {
        fprintf(
            stderr,
            "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n",
            thread_name,
            state->heartbeat_timeout_sec
        );
        stall_seconds = (unsigned int) state->heartbeat_timeout_sec + 2U;
        sleep(stall_seconds);
    }
}
/*
 * Zeroes `stats` and initializes its mutex.
 * Returns 0 on success. Returns -1 with errno set to EINVAL for a NULL
 * pointer, or to the pthread_mutex_init() error code.
 */
static int control_bridge_stats_init(control_bridge_stats_t *stats) {
    int init_rc;

    if (stats == NULL) {
        errno = EINVAL;
        return -1;
    }

    memset(stats, 0, sizeof(*stats));
    init_rc = pthread_mutex_init(&stats->mutex, NULL);
    if (init_rc == 0) {
        return 0;
    }
    errno = init_rc; /* pthread functions return the error code instead of setting errno */
    return -1;
}
/* Destroys the mutex inside `stats`; a NULL pointer is ignored. */
static void control_bridge_stats_destroy(control_bridge_stats_t *stats) {
    if (stats != NULL) {
        pthread_mutex_destroy(&stats->mutex);
    }
}
/* Forward declarations for helpers that are used before their definitions below. */
static void unix_dgram_client_close(unix_dgram_client_t *client);
static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats);
static void close_control_ack_client(kcp_client_t **client_ptr);
/* Returns 1 when both the ACK peer id and the ACK target peer are non-empty strings, else 0. */
static int control_ack_enabled(const daemon_state_t *state) {
    if (state == NULL) {
        return 0;
    }
    if (state->control_ack_peer_id == NULL || state->control_ack_peer_id[0] == '\0') {
        return 0;
    }
    if (state->control_ack_target_peer == NULL || state->control_ack_target_peer[0] == '\0') {
        return 0;
    }
    return 1;
}
/*
 * Initializes the ACK manager fields of `state`: the mutex, the shutdown flag,
 * and the client/thread/request bookkeeping (all cleared).
 * Returns 0 on success. Returns -1 with errno set to EINVAL for NULL state, or
 * to the pthread_mutex_init() error code.
 */
static int control_ack_manager_init(daemon_state_t *state) {
    int init_rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }

    init_rc = pthread_mutex_init(&state->control_ack_mutex, NULL);
    if (init_rc != 0) {
        errno = init_rc;
        return -1;
    }

    atomic_init(&state->control_ack_shutdown_requested, 0);
    state->control_ack_client = NULL;
    state->control_ack_thread_started = 0;
    state->control_ack_connect_requested = 0;
    state->control_ack_connect_inflight = 0;
    return 0;
}
/*
 * Drops the current ACK client, if any, and closes it outside the lock.
 * A new connection is requested only when `request_connect` is non-zero, ACKs
 * are enabled, and the connector thread has been started.
 */
static void control_ack_manager_reset(daemon_state_t *state, int request_connect) {
    kcp_client_t *detached;
    int reconnect = 0;

    if (state == NULL) {
        return;
    }

    pthread_mutex_lock(&state->control_ack_mutex);
    detached = state->control_ack_client;
    state->control_ack_client = NULL;
    if (request_connect && control_ack_enabled(state) && state->control_ack_thread_started) {
        reconnect = 1;
    }
    state->control_ack_connect_requested = reconnect;
    pthread_mutex_unlock(&state->control_ack_mutex);

    /* Close after unlocking so the mutex is not held during the close. */
    close_control_ack_client(&detached);
}
/*
 * Shuts the ACK manager down: signals the connector thread to stop, joins it
 * if it was started, closes any remaining ACK client, and destroys the mutex.
 */
static void control_ack_manager_destroy(daemon_state_t *state) {
    if (state == NULL) {
        return;
    }

    atomic_store(&state->control_ack_shutdown_requested, 1);

    if (state->control_ack_thread_started != 0) {
        pthread_join(state->control_ack_thread, NULL);
        state->control_ack_thread_started = 0;
    }

    control_ack_manager_reset(state, 0);
    pthread_mutex_destroy(&state->control_ack_mutex);
}
/*
 * Serializes `root` and replaces the file at `path` with it, by writing
 * "<path>.tmp.<pid>" first and then rename()ing it over `path`.
 *
 * Returns 0 on success. Returns -1 with errno set on failure: EINVAL for NULL
 * arguments, ENAMETOOLONG when the temp path does not fit, ENOMEM when
 * serialization fails, otherwise the errno of the failing stdio/rename call.
 * All failure paths share one cleanup that preserves errno across
 * fclose/unlink/cJSON_free and removes the temp file if it was created.
 */
static int write_status_json_atomic(const char *path, cJSON *root) {
    char temp_path[640];
    char *json = NULL;
    FILE *file = NULL;
    size_t json_len;
    int temp_created = 0;
    int saved_errno;
    int close_rc;
    int n;

    if (path == NULL || root == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* Refuse a truncated temp name; it could collide with an unrelated file. */
    n = snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid());
    if (n < 0 || (size_t) n >= sizeof(temp_path)) {
        errno = ENAMETOOLONG;
        return -1;
    }

    json = cJSON_PrintUnformatted(root);
    if (json == NULL) {
        errno = ENOMEM;
        return -1;
    }

    file = fopen(temp_path, "wb");
    if (file == NULL) {
        goto fail;
    }
    temp_created = 1;

    json_len = strlen(json);
    if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) {
        goto fail;
    }

    /* The stream is gone after fclose() even on failure; never close it twice. */
    close_rc = fclose(file);
    file = NULL;
    if (close_rc != 0) {
        goto fail;
    }

    if (rename(temp_path, path) != 0) {
        goto fail;
    }

    cJSON_free(json);
    return 0;

fail:
    saved_errno = errno;
    if (file != NULL) {
        fclose(file);
    }
    if (temp_created) {
        unlink(temp_path);
    }
    cJSON_free(json);
    errno = saved_errno;
    return -1;
}
/*
 * Writes the daemon status JSON (timestamps, pid, thread heartbeats, video and
 * control counters) to state->status_file_path, creating the runtime directory
 * first if needed.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int write_daemon_status_file(daemon_state_t *state) {
    video_pipeline_stats_t video;
    control_bridge_stats_t control;
    cJSON *doc;
    int write_rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }
    if (ensure_runtime_dir(state->runtime_dir) != 0) {
        return -1;
    }

    /* Work on snapshots so the JSON is built without holding the stats locks. */
    memset(&video, 0, sizeof(video));
    memset(&control, 0, sizeof(control));
    video_pipeline_stats_snapshot(&state->video_stats, &video);
    control_bridge_stats_snapshot(&state->control_stats, &control);

    doc = cJSON_CreateObject();
    if (doc == NULL) {
        errno = ENOMEM;
        return -1;
    }

    /* Process-level fields. Heartbeats are stored in seconds and reported in ms. */
    cJSON_AddNumberToObject(doc, "updated_at_epoch_ms", (double) realtime_epoch_ms());
    cJSON_AddNumberToObject(doc, "pid", (double) getpid());
    cJSON_AddNumberToObject(
        doc,
        "video_thread_heartbeat_epoch_ms",
        (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0
    );
    cJSON_AddNumberToObject(
        doc,
        "control_thread_heartbeat_epoch_ms",
        (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0
    );

    /* Video pipeline fields. */
    cJSON_AddBoolToObject(doc, "video_connected", video.connected != 0);
    cJSON_AddNumberToObject(doc, "video_frames_sent", (double) video.frames_sent);
    cJSON_AddNumberToObject(doc, "video_send_errors", (double) video.send_errors);
    cJSON_AddNumberToObject(doc, "video_backlog_resets", (double) video.backlog_resets);
    cJSON_AddNumberToObject(doc, "video_last_capture_to_send_ms", (double) video.last_capture_to_send_ms);
    cJSON_AddNumberToObject(doc, "video_avg_capture_to_send_ms", video.avg_capture_to_send_ms);
    cJSON_AddStringToObject(doc, "video_last_error", video.last_error);

    /* Control bridge fields. */
    cJSON_AddBoolToObject(doc, "control_registered", control.registered != 0);
    cJSON_AddNumberToObject(doc, "control_reconnect_count", (double) control.reconnect_count);
    cJSON_AddNumberToObject(doc, "control_unix_send_errors", (double) control.unix_send_errors);
    cJSON_AddStringToObject(doc, "control_last_error", control.last_error);

    write_rc = write_status_json_atomic(state->status_file_path, doc);
    cJSON_Delete(doc);
    return write_rc;
}
/*
 * Returns 1 when the heartbeat is older than `timeout_sec` relative to
 * `now_sec`, else 0. A NULL heartbeat, a non-positive timeout, or a heartbeat
 * that was never set (value <= 0) never counts as expired.
 */
static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) {
    long last_beat;

    if (heartbeat == NULL || timeout_sec <= 0) {
        return 0;
    }
    last_beat = atomic_load(heartbeat);
    return last_beat > 0 && (now_sec - last_beat) > timeout_sec;
}
/*
 * Watchdog check: terminates the process with a thread-specific exit code when
 * the video or control heartbeat is older than the configured timeout.
 * The video thread is checked first. Does nothing when the timeout is <= 0.
 */
static void exit_if_thread_stalled(daemon_state_t *state) {
    if (state == NULL) {
        return;
    }
    if (state->heartbeat_timeout_sec <= 0) {
        return;
    }

    const long now_sec = realtime_epoch_sec();
    const int video_stalled = thread_heartbeat_expired(
        &state->video_thread_heartbeat_epoch_sec,
        state->heartbeat_timeout_sec,
        now_sec
    );
    if (video_stalled) {
        fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_VIDEO_THREAD_STALLED);
    }

    const int control_stalled = thread_heartbeat_expired(
        &state->control_thread_heartbeat_epoch_sec,
        state->heartbeat_timeout_sec,
        now_sec
    );
    if (control_stalled) {
        fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_CONTROL_THREAD_STALLED);
    }
}
/* Stores `message` (or "" when NULL) as the last error, truncating to fit; takes the stats mutex. */
static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) {
    const char *text = message != NULL ? message : "";

    if (stats == NULL) {
        return;
    }
    pthread_mutex_lock(&stats->mutex);
    snprintf(stats->last_error, sizeof(stats->last_error), "%s", text);
    pthread_mutex_unlock(&stats->mutex);
}
/* Stores `message` (or "" when NULL) as the last reconnect reason, truncating to fit; takes the stats mutex. */
static void control_bridge_set_reconnect_reason(control_bridge_stats_t *stats, const char *message) {
    const char *text = message != NULL ? message : "";

    if (stats == NULL) {
        return;
    }
    pthread_mutex_lock(&stats->mutex);
    snprintf(stats->last_reconnect_reason, sizeof(stats->last_reconnect_reason), "%s", text);
    pthread_mutex_unlock(&stats->mutex);
}
/*
 * Records "<prefix>: <strerror(errno)> (errno=N)" as the last error.
 * errno is captured on entry, before any other call can overwrite it.
 * A NULL prefix is replaced by "control bridge error"; errno 0 is reported as
 * "unknown error".
 */
static void control_bridge_set_errno_error(control_bridge_stats_t *stats, const char *prefix) {
    const int captured_errno = errno;
    char text[256];
    const char *label = "control bridge error";
    const char *detail = "unknown error";

    if (prefix != NULL) {
        label = prefix;
    }
    if (captured_errno != 0) {
        detail = strerror(captured_errno);
    }
    snprintf(text, sizeof(text), "%s: %s (errno=%d)", label, detail, captured_errno);
    control_bridge_set_error(stats, text);
}
/*
 * Copies the reportable fields of `stats` into `out_stats` under the stats
 * mutex. `out_stats` is zeroed first; its mutex and `ever_connected` are not
 * copied and therefore stay zero. Does nothing if either pointer is NULL.
 */
static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats) {
    if (stats == NULL || out_stats == NULL) {
        return;
    }

    memset(out_stats, 0, sizeof(*out_stats));

    pthread_mutex_lock(&stats->mutex);

    /* Counters. */
    out_stats->packets_forwarded = stats->packets_forwarded;
    out_stats->invalid_packets = stats->invalid_packets;
    out_stats->unix_send_errors = stats->unix_send_errors;
    out_stats->reconnect_count = stats->reconnect_count;

    /* Session status. */
    out_stats->server_idle_ms = stats->server_idle_ms;
    out_stats->registered = stats->registered;

    /* Text fields. */
    snprintf(out_stats->last_error, sizeof(out_stats->last_error), "%s", stats->last_error);
    snprintf(
        out_stats->last_reconnect_reason,
        sizeof(out_stats->last_reconnect_reason),
        "%s",
        stats->last_reconnect_reason
    );

    /* Transport stats are copied by value. */
    out_stats->transport = stats->transport;

    pthread_mutex_unlock(&stats->mutex);
}
/*
 * Returns 1 when the server error text contains one of the substrings that
 * mark the session as stale (so the caller should reconnect), else 0.
 * NULL and empty messages return 0.
 */
static int control_server_error_requires_reconnect(const char *message) {
    static const char *const stale_markers[] = {
        "not registered",
        "first message must be register",
        "peer replaced",
        "timed out waiting for server_register_ok",
    };
    const size_t marker_count = sizeof(stale_markers) / sizeof(stale_markers[0]);

    if (message == NULL || message[0] == '\0') {
        return 0;
    }
    for (size_t i = 0; i < marker_count; i++) {
        if (strstr(message, stale_markers[i]) != NULL) {
            return 1;
        }
    }
    return 0;
}
/*
 * Copies the message body into `buffer` as a NUL-terminated string, truncated
 * to buffer_len - 1 bytes. `buffer` becomes "" when the message or its body is
 * missing or empty. Does nothing when `buffer` is NULL or has zero length.
 */
static void control_message_body_to_cstr(const message_t *msg, char *buffer, size_t buffer_len) {
    size_t capacity;
    size_t n;

    if (buffer == NULL || buffer_len == 0) {
        return;
    }
    buffer[0] = '\0';

    if (msg == NULL || msg->body == NULL || msg->body_len == 0) {
        return;
    }

    capacity = buffer_len - 1U; /* leave room for the terminator */
    n = msg->body_len;
    if (n > capacity) {
        n = capacity;
    }
    memcpy(buffer, msg->body, n);
    buffer[n] = '\0';
}
/*
 * Dials a KCP client for the control ACK session, using the ACK peer id and
 * the same server/relay/bind settings as the control session.
 * Returns the client from kcp_client_dial_with_options(), or NULL with errno
 * set to EINVAL when the state or the ACK peer id is missing or empty.
 */
static kcp_client_t *connect_control_ack_client(const daemon_state_t *state) {
    kcp_conn_options_t dial_options;
    int have_peer_id;

    have_peer_id = state != NULL
        && state->control_ack_peer_id != NULL
        && state->control_ack_peer_id[0] != '\0';
    if (!have_peer_id) {
        errno = EINVAL;
        return NULL;
    }

    kcp_conn_options_set_control_defaults(&dial_options);
    return kcp_client_dial_with_options(
        state->control_server_addr,
        state->control_relay_via,
        state->control_ack_peer_id,
        state->control_bind_ip,
        state->control_bind_device,
        &dial_options,
        NULL,
        NULL,
        state->stats_logger,
        state->stats_interval_ms
    );
}
/* Closes and frees *client_ptr, then sets it to NULL. NULL inputs are ignored. */
static void close_control_ack_client(kcp_client_t **client_ptr) {
    kcp_client_t *victim;

    if (client_ptr == NULL) {
        return;
    }
    victim = *client_ptr;
    if (victim == NULL) {
        return;
    }
    kcp_client_close(victim);
    kcp_client_free(victim);
    *client_ptr = NULL;
}
/*
 * Asks the connector thread to establish the ACK session, unless a client is
 * already present. Ignored when ACKs are disabled or the connector thread has
 * not been started.
 */
static void control_ack_manager_request_connect(daemon_state_t *state) {
    if (state == NULL) {
        return;
    }
    if (!control_ack_enabled(state) || !state->control_ack_thread_started) {
        return;
    }

    pthread_mutex_lock(&state->control_ack_mutex);
    if (state->control_ack_client == NULL) {
        state->control_ack_connect_requested = 1;
    }
    pthread_mutex_unlock(&state->control_ack_mutex);
}
/*
 * Connector thread for the control ACK session.
 *
 * Loops until shutdown or stop is requested. Each iteration checks, under
 * control_ack_mutex, whether a connection was requested and none exists or is
 * in flight. If so it dials outside the lock and then publishes the new client
 * only if the request is still pending. `arg` is the daemon_state_t. Always
 * returns NULL.
 */
static void *control_ack_thread_main(void *arg) {
daemon_state_t *state = (daemon_state_t *) arg;
while (!atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) {
kcp_client_t *client = NULL;
int connect_failed = 0;
int should_connect = 0;
/* Claim the connect request; `inflight` marks that a dial is in progress. */
pthread_mutex_lock(&state->control_ack_mutex);
if (state->control_ack_connect_requested && state->control_ack_client == NULL && !state->control_ack_connect_inflight) {
state->control_ack_connect_inflight = 1;
should_connect = 1;
}
pthread_mutex_unlock(&state->control_ack_mutex);
if (!should_connect) {
/* Nothing to do: poll again in 200 ms. */
usleep(200000);
continue;
}
/* Dial without holding the mutex. */
client = connect_control_ack_client(state);
connect_failed = client == NULL;
pthread_mutex_lock(&state->control_ack_mutex);
state->control_ack_connect_inflight = 0;
/* Publish only if the request is still pending and no shutdown was requested
 * while dialing; otherwise the fresh client is discarded below. */
if (
client != NULL
&& state->control_ack_connect_requested
&& state->control_ack_client == NULL
&& !atomic_load(&state->control_ack_shutdown_requested)
&& !*state->stop_requested
) {
state->control_ack_client = client;
state->control_ack_connect_requested = 0;
/* Ownership moved to state; clear the local so it is not closed below. */
client = NULL;
}
pthread_mutex_unlock(&state->control_ack_mutex);
if (client != NULL) {
close_control_ack_client(&client);
}
/* Back off for a second after a failed dial before retrying. */
if (connect_failed && !atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) {
sleep(1);
}
}
return NULL;
}
/*
 * Sends a "persist_end" ACK for `msg` over the ACK session, if one is up.
 *
 * Skipped when the arguments are invalid, the timestamps are not strictly
 * increasing and positive, ACKs are disabled, or the connector thread is not
 * running. When no ACK client exists the ACK is dropped and a connection is
 * requested. When the send fails the client is detached, a reconnect is
 * requested, and the client is closed after the mutex is released.
 * `sample_reason` defaults to "sample_mod" when NULL.
 */
static void maybe_send_control_ack(
    daemon_state_t *state,
    const message_t *msg,
    int64_t recv_unix_nano,
    int64_t persist_end_unix_nano,
    const char *sample_reason
) {
    kcp_client_t *failed_client = NULL;
    char *payload;

    if (state == NULL || msg == NULL) {
        return;
    }
    if (recv_unix_nano <= 0 || persist_end_unix_nano <= recv_unix_nano) {
        return;
    }
    if (!control_ack_enabled(state) || !state->control_ack_thread_started) {
        return;
    }

    payload = omni_strdup_printf(
        "{\"message_id\":%" PRIu64 ",\"ack_phase\":\"persist_end\",\"b_recv_to_persist_us\":%" PRId64 ",\"unix_send_ok\":true,\"sample_reason\":\"%s\"}",
        msg->id,
        (persist_end_unix_nano - recv_unix_nano) / 1000,
        sample_reason == NULL ? "sample_mod" : sample_reason
    );
    if (payload == NULL) {
        return;
    }

    pthread_mutex_lock(&state->control_ack_mutex);
    if (state->control_ack_client == NULL) {
        /* No session yet: drop this ACK and ask the connector thread for one. */
        state->control_ack_connect_requested = 1;
    } else if (kcp_client_send_text(state->control_ack_client, state->control_ack_target_peer, payload) != 0) {
        /* Send failed: detach the client and request a new connection. */
        failed_client = state->control_ack_client;
        state->control_ack_client = NULL;
        state->control_ack_connect_requested = 1;
    }
    pthread_mutex_unlock(&state->control_ack_mutex);

    free(payload);
    /* No-op when nothing was detached. */
    close_control_ack_client(&failed_client);
}
/*
 * Opens an AF_UNIX datagram socket bound to a per-process path under /tmp and
 * records `dest_path` as the destination for unix_dgram_client_send().
 *
 * Returns 0 on success. Returns -1 with errno set on failure:
 *   EINVAL        NULL client, or NULL/empty dest_path
 *   ENAMETOOLONG  dest_path does not fit in sun_path or client->dest_path
 *                 (it used to be truncated silently, so packets would have
 *                 been sent to a different, wrong address)
 *   otherwise     errno from socket() or bind(), preserved across the cleanup
 * After a failure client->fd is -1.
 */
static int unix_dgram_client_init(unix_dgram_client_t *client, const char *dest_path) {
    struct sockaddr_un bind_addr;
    size_t dest_path_len;
    int saved_errno;

    if (client == NULL || dest_path == NULL || dest_path[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    /* Measure before memset: only the length is needed for validation. */
    dest_path_len = strlen(dest_path);

    memset(client, 0, sizeof(*client));
    client->fd = -1; /* a zeroed fd would alias stdin */

    if (dest_path_len >= sizeof(client->dest_addr.sun_path) || dest_path_len >= sizeof(client->dest_path)) {
        errno = ENAMETOOLONG;
        return -1;
    }

    client->fd = socket(AF_UNIX, SOCK_DGRAM, 0);
    if (client->fd < 0) {
        return -1;
    }

    /* Bind to a pid-specific path; remove any stale socket file first. */
    snprintf(
        client->bind_path,
        sizeof(client->bind_path),
        "/tmp/omnisocket-b-side-cmd-client-%ld.sock",
        (long) getpid()
    );
    unlink(client->bind_path);

    memset(&bind_addr, 0, sizeof(bind_addr));
    bind_addr.sun_family = AF_UNIX;
    snprintf(bind_addr.sun_path, sizeof(bind_addr.sun_path), "%s", client->bind_path);
    if (bind(client->fd, (const struct sockaddr *) &bind_addr, sizeof(bind_addr)) != 0) {
        saved_errno = errno; /* close()/unlink() may overwrite the bind error */
        close(client->fd);
        unlink(client->bind_path);
        client->fd = -1;
        errno = saved_errno;
        return -1;
    }

    memset(&client->dest_addr, 0, sizeof(client->dest_addr));
    client->dest_addr.sun_family = AF_UNIX;
    snprintf(client->dest_path, sizeof(client->dest_path), "%s", dest_path);
    snprintf(client->dest_addr.sun_path, sizeof(client->dest_addr.sun_path), "%s", dest_path);
    client->dest_len = (socklen_t) sizeof(client->dest_addr);
    return 0;
}
/*
 * Sends one datagram of `len` bytes to the client's destination address.
 * Returns 0 when the whole payload was sent. Returns -1 with errno set to
 * EINVAL for bad arguments, EIO for a short send, or the sendto() error.
 */
static int unix_dgram_client_send(unix_dgram_client_t *client, const void *data, size_t len) {
    ssize_t sent;

    if (client == NULL || client->fd < 0 || (data == NULL && len > 0)) {
        errno = EINVAL;
        return -1;
    }

    sent = sendto(client->fd, data, len, 0, (const struct sockaddr *) &client->dest_addr, client->dest_len);
    if (sent < 0) {
        return -1; /* errno comes from sendto() */
    }
    if ((size_t) sent != len) {
        errno = EIO;
        return -1;
    }
    return 0;
}
/*
 * Closes the client and initializes it again for the same destination path.
 * Returns the result of unix_dgram_client_init(), or -1 with errno EINVAL when
 * the client is NULL or has no recorded destination.
 */
static int unix_dgram_client_reopen(unix_dgram_client_t *client) {
    char saved_dest[sizeof(client->dest_path)];

    if (client == NULL || client->dest_path[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    /* Copy first: init zeroes the whole client, including dest_path. */
    snprintf(saved_dest, sizeof(saved_dest), "%s", client->dest_path);
    unix_dgram_client_close(client);
    return unix_dgram_client_init(client, saved_dest);
}
/* Returns 1 for send errors that justify reopening the unix socket (ENOENT, ECONNREFUSED, EBADF, ENOTCONN), else 0. */
static int unix_dgram_client_should_reopen(int error_code) {
    switch (error_code) {
    case ENOENT:
    case ECONNREFUSED:
    case EBADF:
    case ENOTCONN:
        return 1;
    default:
        return 0;
    }
}
/*
 * Closes the socket (if open) and removes the bound socket file (if any).
 * Safe to call repeatedly: fd is reset to -1 and bind_path is cleared.
 */
static void unix_dgram_client_close(unix_dgram_client_t *client) {
    const int has_bind_path = client != NULL && client->bind_path[0] != '\0';

    if (client == NULL) {
        return;
    }
    if (client->fd >= 0) {
        close(client->fd);
        client->fd = -1;
    }
    if (has_bind_path) {
        unlink(client->bind_path);
        client->bind_path[0] = '\0';
    }
}
/*
 * Video worker thread. Runs the video pipeline repeatedly until a stop is
 * requested or the pipeline returns 0. The heartbeat is refreshed before and
 * after each run. A VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE result restarts at
 * once; any other non-zero result waits one second first. `arg` is the
 * daemon_state_t. Always returns NULL.
 */
static void *video_thread_main(void *arg) {
    daemon_state_t *state = (daemon_state_t *) arg;

    for (;;) {
        int run_rc;

        if (*state->stop_requested) {
            break;
        }

        update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
        maybe_inject_thread_stall(state, state->video_thread_fault_file, "video");
        run_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested);
        update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);

        if (run_rc == 0) {
            break;
        }
        if (run_rc != VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE && !*state->stop_requested) {
            sleep(1);
        }
    }
    return NULL;
}
/*
 * Control worker thread: bridges control packets from the KCP control session
 * to the local unix datagram socket.
 *
 * Outer loop: dial the control session, retrying every second on failure.
 * Inner loop: poll for messages with a 100 ms timeout and, for each message,
 *   - handle server error messages (possibly forcing a reconnect),
 *   - drop messages from an unexpected sender or with the wrong type/size,
 *   - forward valid packets to the unix socket (one reopen + retry on
 *     selected errors),
 *   - optionally log latency events and send a sampled ACK.
 * When the inner loop ends the session is torn down and re-dialed, either
 * immediately or after one second. `arg` is the daemon_state_t. Always
 * returns NULL.
 */
static void *control_thread_main(void *arg) {
daemon_state_t *state = (daemon_state_t *) arg;
while (!*state->stop_requested) {
kcp_conn_options_t options;
kcp_client_t *client = NULL;
/* Set when the session was found stale, to skip the 1 s back-off below. */
int reconnect_immediately = 0;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
maybe_inject_thread_stall(state, state->control_thread_fault_file, "control");
kcp_conn_options_set_control_defaults(&options);
client = kcp_client_dial_with_options(
state->control_server_addr,
state->control_relay_via,
state->control_peer_id,
state->control_bind_ip,
state->control_bind_device,
&options,
NULL,
NULL,
state->stats_logger,
state->stats_interval_ms
);
if (client == NULL) {
control_bridge_set_errno_error(&state->control_stats, "failed to connect control session");
sleep(1);
continue;
}
/* Connected: record session status; every connect after the first counts
 * as a reconnect. */
{
kcp_client_state_t client_state;
memset(&client_state, 0, sizeof(client_state));
kcp_client_state_snapshot(client, &client_state);
pthread_mutex_lock(&state->control_stats.mutex);
if (state->control_stats.ever_connected) {
state->control_stats.reconnect_count += 1;
} else {
state->control_stats.ever_connected = 1;
}
state->control_stats.registered = client_state.registered;
state->control_stats.server_idle_ms = client_state.server_idle_ms;
state->control_stats.last_reconnect_reason[0] = '\0';
snprintf(state->control_stats.last_error, sizeof(state->control_stats.last_error), "%s", client_state.last_server_error);
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex);
}
/* Ask the connector thread to bring up the ACK session as well. */
control_ack_manager_request_connect(state);
while (!*state->stop_requested) {
message_t msg;
int rc;
kcp_client_state_t client_state;
int ack_sampled = 0;
int log_control_latency = 0;
int64_t recv_unix_nano = 0;
int64_t persist_begin_unix_nano = 0;
int64_t persist_end_unix_nano = 0;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
protocol_message_init(&msg);
rc = kcp_client_receive_timed(client, &msg, 100);
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
/* rc == 1: no message was delivered by this poll.
 * NOTE(review): presumably the 100 ms timeout — confirm against
 * kcp_client_receive_timed. The idle time is used to refresh stats and to
 * check whether the session has gone stale. */
if (rc == 1) {
char reconnect_reason[256];
protocol_message_clear(&msg);
memset(&client_state, 0, sizeof(client_state));
kcp_client_state_snapshot(client, &client_state);
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.registered = client_state.registered;
state->control_stats.server_idle_ms = client_state.server_idle_ms;
snprintf(state->control_stats.last_error, sizeof(state->control_stats.last_error), "%s", client_state.last_server_error);
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex);
/* Staleness checks in priority order: unregistered, server idle too long,
 * then a server error that requires a reconnect. */
if (!client_state.registered) {
snprintf(reconnect_reason, sizeof(reconnect_reason), "control session stale: server reported unregistered");
} else if (
state->control_server_idle_reconnect_ms > 0
&& client_state.server_idle_ms >= (uint32_t) state->control_server_idle_reconnect_ms
) {
snprintf(
reconnect_reason,
sizeof(reconnect_reason),
"control session stale: server idle timeout (%u ms >= %d ms)",
client_state.server_idle_ms,
state->control_server_idle_reconnect_ms
);
} else if (control_server_error_requires_reconnect(client_state.last_server_error)) {
snprintf(
reconnect_reason,
sizeof(reconnect_reason),
"control session stale: server error %.180s",
client_state.last_server_error
);
} else {
reconnect_reason[0] = '\0';
}
if (reconnect_reason[0] != '\0') {
control_bridge_set_error(&state->control_stats, reconnect_reason);
control_bridge_set_reconnect_reason(&state->control_stats, reconnect_reason);
fprintf(stderr, "[b_side_omnid] %s\n", reconnect_reason);
reconnect_immediately = 1;
break;
}
continue;
}
/* Any other non-zero rc ends this session; reconnect immediately only if the
 * server error text calls for it. */
if (rc != 0) {
memset(&client_state, 0, sizeof(client_state));
kcp_client_state_snapshot(client, &client_state);
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.registered = client_state.registered;
state->control_stats.server_idle_ms = client_state.server_idle_ms;
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex);
if (client_state.last_server_error[0] != '\0') {
control_bridge_set_error(&state->control_stats, client_state.last_server_error);
if (control_server_error_requires_reconnect(client_state.last_server_error)) {
control_bridge_set_reconnect_reason(&state->control_stats, client_state.last_server_error);
reconnect_immediately = 1;
}
} else {
control_bridge_set_errno_error(&state->control_stats, "control receive loop stopped");
}
protocol_message_clear(&msg);
break;
}
/* Error message sent by the server itself: record it, and reconnect if it
 * marks the session as stale. */
if (msg.type == MSG_TYPE_ERROR && strcmp(msg.from, SERVER_PEER_ID) == 0) {
char server_error[256];
control_message_body_to_cstr(&msg, server_error, sizeof(server_error));
control_bridge_set_error(&state->control_stats, server_error);
if (control_server_error_requires_reconnect(server_error)) {
char reconnect_reason[256];
snprintf(reconnect_reason, sizeof(reconnect_reason), "control session stale: server error %.180s", server_error);
control_bridge_set_reconnect_reason(&state->control_stats, reconnect_reason);
fprintf(stderr, "[b_side_omnid] %s\n", reconnect_reason);
reconnect_immediately = 1;
protocol_message_clear(&msg);
break;
}
protocol_message_clear(&msg);
continue;
}
/* Sender filter: only applied when an expected sender is configured. */
if (state->control_expected_sender[0] != '\0' && strcmp(msg.from, state->control_expected_sender) != 0) {
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.invalid_packets += 1;
pthread_mutex_unlock(&state->control_stats.mutex);
protocol_message_clear(&msg);
continue;
}
/* Only binary messages of exactly OMNI_CONTROL_PACKET_SIZE bytes are forwarded. */
if (msg.type != MSG_TYPE_BINARY || msg.body_len != OMNI_CONTROL_PACKET_SIZE) {
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.invalid_packets += 1;
pthread_mutex_unlock(&state->control_stats.mutex);
protocol_message_clear(&msg);
continue;
}
/* ACK-sampled messages are always latency-logged too, because the ACK needs
 * the receive and persist-end timestamps. */
ack_sampled = should_send_control_ack(state, &msg);
log_control_latency = ack_sampled || should_log_control_latency(state, &msg);
if (log_control_latency) {
recv_unix_nano = omni_now_unix_nano();
persist_begin_unix_nano = recv_unix_nano;
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_APP_RECV,
recv_unix_nano,
&msg
);
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_BEGIN,
persist_begin_unix_nano,
&msg
);
}
/* Forward to the unix socket. On selected errors, reopen the socket once and
 * retry the send. */
if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) {
/* Keep the first send's errno; the reopen/retry may overwrite it. */
int send_errno = errno;
int recovered = 0;
if (unix_dgram_client_should_reopen(send_errno) && unix_dgram_client_reopen(&state->unix_client) == 0) {
recovered = unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) == 0;
}
if (recovered) {
/* Retry succeeded: same bookkeeping as the normal success path below. */
memset(&client_state, 0, sizeof(client_state));
kcp_client_state_snapshot(client, &client_state);
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.packets_forwarded += 1;
state->control_stats.registered = client_state.registered;
state->control_stats.server_idle_ms = client_state.server_idle_ms;
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex);
if (log_control_latency) {
persist_end_unix_nano = omni_now_unix_nano();
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_END,
persist_end_unix_nano,
&msg
);
}
if (ack_sampled) {
maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod");
}
protocol_message_clear(&msg);
continue;
}
/* Forwarding failed: report the errno of the first send attempt. */
errno = send_errno;
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.unix_send_errors += 1;
pthread_mutex_unlock(&state->control_stats.mutex);
control_bridge_set_errno_error(&state->control_stats, "failed to forward command to unix socket");
protocol_message_clear(&msg);
continue;
}
/* Normal success path: count the packet and refresh the session status. */
memset(&client_state, 0, sizeof(client_state));
kcp_client_state_snapshot(client, &client_state);
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.packets_forwarded += 1;
state->control_stats.registered = client_state.registered;
state->control_stats.server_idle_ms = client_state.server_idle_ms;
kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport);
pthread_mutex_unlock(&state->control_stats.mutex);
if (log_control_latency) {
persist_end_unix_nano = omni_now_unix_nano();
latencylog_log_message_event_at(
state->control_latency_logger,
OMNI_NODE_ROLE_PEER,
state->control_peer_id,
EVENT_B_PERSIST_END,
persist_end_unix_nano,
&msg
);
}
if (ack_sampled) {
maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod");
}
protocol_message_clear(&msg);
}
/* Session over: mark it unregistered, drop the ACK session, free the client. */
pthread_mutex_lock(&state->control_stats.mutex);
state->control_stats.registered = 0;
state->control_stats.server_idle_ms = 0;
pthread_mutex_unlock(&state->control_stats.mutex);
control_ack_manager_reset(state, 0);
kcp_client_close(client);
kcp_client_free(client);
/* Back off for a second unless stopping or the session was found stale. */
if (!*state->stop_requested && !reconnect_immediately) {
sleep(1);
}
}
return NULL;
}
/*
 * Write a single-line summary of the video pipeline and control bridge
 * counters to stderr.
 *
 * The values are read from local copies filled in by
 * video_pipeline_stats_snapshot() and control_bridge_stats_snapshot(), not
 * from the live structures in `state`.
 *
 * state: daemon state holding the live video and control statistics.
 */
static void print_stats(daemon_state_t *state) {
    video_pipeline_stats_t video;
    control_bridge_stats_t control;
    const char *backlog_reason;
    const char *reconnect_reason;

    memset(&video, 0, sizeof(video));
    memset(&control, 0, sizeof(control));
    video_pipeline_stats_snapshot(&state->video_stats, &video);
    control_bridge_stats_snapshot(&state->control_stats, &control);

    /* Empty reason strings are printed as "-" so the log line keeps its shape. */
    backlog_reason = video.last_backlog_reason;
    if (backlog_reason[0] == '\0') {
        backlog_reason = "-";
    }
    reconnect_reason = control.last_reconnect_reason;
    if (reconnect_reason[0] == '\0') {
        reconnect_reason = "-";
    }

    fprintf(
        stderr,
        "[b_side_omnid] video registered=%d frames=%llu bytes=%llu drops=%llu resets=%llu backlog=%u cap2send=%ums avg=%.1fms reason=%s srtt=%dms | control registered=%d idle=%ums reconnects=%llu forwarded=%llu invalid=%llu unix_err=%llu srtt=%dms last_reconnect=%s\n",
        /* video half */
        video.connected,
        (unsigned long long) video.frames_sent,
        (unsigned long long) video.bytes_sent,
        (unsigned long long) video.backpressure_drops,
        (unsigned long long) video.backlog_resets,
        video.last_backlog_segments,
        video.last_capture_to_send_ms,
        video.avg_capture_to_send_ms,
        backlog_reason,
        video.transport.srtt_ms,
        /* control half */
        control.registered,
        control.server_idle_ms,
        (unsigned long long) control.reconnect_count,
        (unsigned long long) control.packets_forwarded,
        (unsigned long long) control.invalid_packets,
        (unsigned long long) control.unix_send_errors,
        control.transport.srtt_ms,
        reconnect_reason
    );
}
/*
 * Format "<runtime_dir>/<file_name>" into dst.
 *
 * snprintf() always NUL-terminates when dst_size > 0, but it truncates
 * silently; a truncated status or fault-injection path would point at the
 * wrong file. Report that on stderr instead of hiding it. The condition is
 * non-fatal, in line with how a failed status-file write is only logged.
 */
static void runtime_path_format_or_warn(char *dst, size_t dst_size, const char *runtime_dir, const char *file_name) {
    int needed = snprintf(dst, dst_size, "%s/%s", runtime_dir, file_name);

    if (needed < 0 || (size_t) needed >= dst_size) {
        fprintf(
            stderr,
            "[b_side_omnid] warning: runtime path %s/%s does not fit in %zu bytes; using a truncated path\n",
            runtime_dir,
            file_name,
            dst_size
        );
    }
}

/*
 * Daemon entry point.
 *
 * Sequence:
 *   1. Load configuration from the environment into `state`.
 *   2. Initialise stats, the control ACK manager, the unix datagram client
 *      and the SIGINT/SIGTERM handlers.
 *   3. Open the optional JSONL loggers (failure to open is only a warning).
 *   4. Start the optional control ACK thread, then the video and control
 *      threads.
 *   5. Once per second: print stats, write the status file and call
 *      exit_if_thread_stalled(), until g_stop_requested is set.
 *   6. Join the worker threads and release everything.
 *
 * Returns 0 after a requested shutdown, 1 when start-up fails.
 *
 * All exit paths after the first successful initialisation share one cleanup
 * chain at the bottom of the function; each failure jumps to the label that
 * releases exactly what has been set up so far.
 */
int main(void) {
    daemon_state_t state;
    pthread_t video_thread;
    pthread_t control_thread;
    long initial_heartbeat;
    int exit_code = 1;

    memset(&state, 0, sizeof(state));
    state.stop_requested = &g_stop_requested;

    /* --- configuration ------------------------------------------------ */
    video_pipeline_config_init(&state.video_config);
    video_pipeline_config_load_env(&state.video_config);
    state.control_server_addr = env_first_nonempty("OMNI_CONTROL_SERVER_ADDR", "OMNISOCKET_SERVER_ADDR", "");
    state.control_relay_via = env_first_nonempty("OMNI_CONTROL_RELAY_VIA", "OMNISOCKET_RELAY_VIA", "");
    state.control_bind_ip = env_first_nonempty("OMNI_CONTROL_BIND_IP", "OMNISOCKET_BIND_IP", "");
    state.control_bind_device = env_first_nonempty("OMNI_CONTROL_BIND_DEVICE", "OMNISOCKET_BIND_DEVICE", "");
    state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID);
    state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER);
    state.control_ack_peer_id = env_or_default("OMNI_CONTROL_ACK_PEER_ID", CONTROL_ACK_DEFAULT_PEER_ID);
    state.control_ack_target_peer = env_or_default("OMNI_CONTROL_ACK_TARGET_PEER", CONTROL_ACK_DEFAULT_TARGET_PEER);
    state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET);
    state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR);
    state.heartbeat_timeout_sec = env_int_or_default(
        "BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
        DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
    );
    state.stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", DEFAULT_KCP_STATS_INTERVAL_MS);
    state.control_latency_sample_mod = env_u64_or_default("BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD", DEFAULT_CONTROL_LATENCY_SAMPLE_MOD);
    state.control_ack_sample_mod = env_u64_or_default("BLITZ_CONTROL_ACK_SAMPLE_MOD", DEFAULT_CONTROL_ACK_SAMPLE_MOD);
    state.video_config.progress_callback = video_pipeline_heartbeat_progress;
    state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
    state.video_config.stats_logger = NULL;
    state.video_config.stage_logger = NULL;
    state.video_config.stats_interval_ms = state.stats_interval_ms;
    state.control_server_idle_reconnect_ms = env_int_or_default(
        "OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
        CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
    );

    /* Paths derived from the runtime directory; truncation is reported. */
    runtime_path_format_or_warn(
        state.status_file_path,
        sizeof(state.status_file_path),
        state.runtime_dir,
        DEFAULT_STATUS_FILE_NAME
    );
    runtime_path_format_or_warn(
        state.video_thread_fault_file,
        sizeof(state.video_thread_fault_file),
        state.runtime_dir,
        DEFAULT_VIDEO_THREAD_FAULT_FILE
    );
    runtime_path_format_or_warn(
        state.control_thread_fault_file,
        sizeof(state.control_thread_fault_file),
        state.runtime_dir,
        DEFAULT_CONTROL_THREAD_FAULT_FILE
    );

    /* Both heartbeats start at "now" so neither thread looks stale before it has run. */
    initial_heartbeat = realtime_epoch_sec();
    atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat);
    atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat);

    if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' ||
        state.control_server_addr == NULL || state.control_server_addr[0] == '\0') {
        fprintf(stderr, "OMNISOCKET_SERVER_ADDR (or session-specific overrides) is required\n");
        return 1;
    }

    /* --- resource initialisation --------------------------------------- */
    if (video_pipeline_stats_init(&state.video_stats) != 0) {
        perror("video_pipeline_stats_init");
        return 1;
    }
    if (control_bridge_stats_init(&state.control_stats) != 0) {
        perror("control_bridge_stats_init");
        goto cleanup_video_stats;
    }
    if (control_ack_manager_init(&state) != 0) {
        perror("control_ack_manager_init");
        goto cleanup_control_stats;
    }
    if (unix_dgram_client_init(&state.unix_client, state.control_unix_socket) != 0) {
        perror("unix_dgram_client_init");
        goto cleanup_ack_manager;
    }
    fprintf(
        stderr,
        "[b_side_omnid] control forwarding target is unix_dgram://%s\n",
        state.control_unix_socket
    );
    if (install_signal_handler(SIGINT) != 0 || install_signal_handler(SIGTERM) != 0) {
        perror("install_signal_handler");
        goto cleanup_unix_client;
    }

    /* --- optional JSONL loggers ---------------------------------------- */
    {
        const char *stats_log_path = getenv("BLITZ_KCP_STATS_LOG_PATH");
        const char *latency_log_path = getenv("BLITZ_CONTROL_LATENCY_LOG_PATH");
        const char *video_stage_log_path = getenv("BLITZ_VIDEO_STAGE_LOG_PATH");
        int latency_enabled = env_int_or_default("BLITZ_CONTROL_LATENCY_LOG_ENABLED", 1);
        int video_stage_log_enabled = env_int_or_default("BLITZ_VIDEO_STAGE_LOG_ENABLED", 1);
        uint64_t video_stage_log_sample_mod = env_u64_or_default("BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD", 10);

        if (stats_log_path != NULL && stats_log_path[0] != '\0') {
            state.stats_logger = kcp_session_stats_open_jsonl(stats_log_path);
            if (state.stats_logger == NULL) {
                fprintf(stderr, "[b_side_omnid] warning: failed to open KCP stats log %s\n", stats_log_path);
            }
        }
        if (latency_enabled && latency_log_path != NULL && latency_log_path[0] != '\0') {
            state.control_latency_logger = latencylog_open_jsonl(latency_log_path);
            if (state.control_latency_logger == NULL) {
                fprintf(stderr, "[b_side_omnid] warning: failed to open control latency log %s\n", latency_log_path);
            }
        }
        if (video_stage_log_enabled && video_stage_log_path != NULL && video_stage_log_path[0] != '\0') {
            state.video_stage_logger = video_stage_logger_open_jsonl(video_stage_log_path, video_stage_log_sample_mod);
            if (state.video_stage_logger == NULL) {
                fprintf(stderr, "[b_side_omnid] warning: failed to open video stage log %s\n", video_stage_log_path);
            }
        }
        /* Hand the (possibly NULL) loggers to the video pipeline before its thread starts. */
        state.video_config.stats_logger = state.stats_logger;
        state.video_config.stage_logger = state.video_stage_logger;
        state.video_config.stats_interval_ms = state.stats_interval_ms;
    }

    /* --- threads -------------------------------------------------------- */
    if (control_ack_enabled(&state)) {
        /* The ACK thread is optional: the daemon keeps running without it. */
        if (pthread_create(&state.control_ack_thread, NULL, control_ack_thread_main, &state) != 0) {
            fprintf(stderr, "[b_side_omnid] warning: failed to start async control ACK manager, ACK sampling disabled\n");
        } else {
            state.control_ack_thread_started = 1;
        }
    }
    if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) {
        perror("pthread_create(video_thread)");
        goto cleanup_unix_client;
    }
    if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) {
        perror("pthread_create(control_thread)");
        /* The video thread is already running: ask it to stop and wait for it. */
        g_stop_requested = 1;
        pthread_join(video_thread, NULL);
        goto cleanup_unix_client;
    }

    /* --- supervision loop ---------------------------------------------- */
    while (!g_stop_requested) {
        sleep(1);
        print_stats(&state);
        if (write_daemon_status_file(&state) != 0) {
            fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno));
        }
        /* NOTE(review): presumably terminates the process when a heartbeat is stale; helper not visible here. */
        exit_if_thread_stalled(&state);
    }

    pthread_join(video_thread, NULL);
    pthread_join(control_thread, NULL);
    exit_code = 0;

    /*
     * Shared teardown. The release order is the one the original shutdown
     * path used. The logger handles are NULL unless they were opened above;
     * the close functions were already called with NULL handles on a normal
     * shutdown whenever a log path was not configured.
     */
cleanup_unix_client:
    unix_dgram_client_close(&state.unix_client);
cleanup_ack_manager:
    control_ack_manager_destroy(&state);
cleanup_control_stats:
    control_bridge_stats_destroy(&state.control_stats);
cleanup_video_stats:
    video_pipeline_stats_destroy(&state.video_stats);
    latencylog_close(state.control_latency_logger);
    video_stage_logger_close(state.video_stage_logger);
    kcp_session_stats_close(state.stats_logger);
    return exit_code;
}