#include #include #include #include #include #include #include #include #include #include #include #include #include #include "cJSON.h" #include "control_protocol.h" #include "latencylog.h" #include "protocol.h" #include "video_pipeline.h" #define CONTROL_DEFAULT_PEER_ID "peer-b-ctrl" #define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl" #define CONTROL_ACK_DEFAULT_PEER_ID "peer-b-ctrl-ack" #define CONTROL_ACK_DEFAULT_TARGET_PEER "peer-a-ctrl-ack" #define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock" #define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000 #define DEFAULT_RUNTIME_DIR "/run/blitz-robot" #define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json" #define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall" #define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall" #define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15 #define DEFAULT_KCP_STATS_INTERVAL_MS 1000 #define DEFAULT_CONTROL_LATENCY_SAMPLE_MOD 100 #define DEFAULT_CONTROL_ACK_SAMPLE_MOD 10 #define EXIT_CODE_VIDEO_THREAD_STALLED 101 #define EXIT_CODE_CONTROL_THREAD_STALLED 102 typedef struct unix_dgram_client { int fd; char bind_path[108]; char dest_path[108]; struct sockaddr_un dest_addr; socklen_t dest_len; } unix_dgram_client_t; typedef struct control_bridge_stats { pthread_mutex_t mutex; uint64_t packets_forwarded; uint64_t invalid_packets; uint64_t unix_send_errors; uint64_t reconnect_count; uint32_t server_idle_ms; int ever_connected; int registered; char last_error[256]; char last_reconnect_reason[256]; kcp_runtime_stats_t transport; } control_bridge_stats_t; typedef struct daemon_state { volatile sig_atomic_t *stop_requested; video_pipeline_config_t video_config; video_pipeline_stats_t video_stats; const char *control_server_addr; const char *control_relay_via; const char *control_bind_ip; const char *control_bind_device; const char *control_peer_id; const char *control_expected_sender; const char *control_ack_peer_id; const char *control_ack_target_peer; const char *control_unix_socket; int control_server_idle_reconnect_ms; const char *runtime_dir; int heartbeat_timeout_sec; int stats_interval_ms; uint64_t control_latency_sample_mod; uint64_t control_ack_sample_mod; char status_file_path[512]; char video_thread_fault_file[512]; char control_thread_fault_file[512]; atomic_long video_thread_heartbeat_epoch_sec; atomic_long control_thread_heartbeat_epoch_sec; atomic_int control_ack_shutdown_requested; kcp_session_stats_logger_t *stats_logger; latency_logger_t *control_latency_logger; video_stage_logger_t *video_stage_logger; unix_dgram_client_t unix_client; control_bridge_stats_t control_stats; pthread_mutex_t control_ack_mutex; pthread_t control_ack_thread; kcp_client_t *control_ack_client; int control_ack_thread_started; int control_ack_connect_requested; int control_ack_connect_inflight; } daemon_state_t; static volatile sig_atomic_t g_stop_requested = 0; static void handle_signal(int signum) { (void) signum; g_stop_requested = 1; } static int install_signal_handler(int signum) { struct sigaction action; memset(&action, 0, sizeof(action)); action.sa_handler = handle_signal; action.sa_flags = SA_RESTART; if (sigemptyset(&action.sa_mask) != 0) { return -1; } return sigaction(signum, &action, NULL); } static const char *env_or_default(const char *name, const char *fallback) { const char *value = getenv(name); if (value != NULL && value[0] != '\0') { return value; } return fallback; } static const char *env_first_nonempty(const char *first, const char *second, const char *fallback) { const char *value = getenv(first); if (value != NULL && value[0] != '\0') { return value; } value = getenv(second); if (value != NULL && value[0] != '\0') { return value; } return fallback; } static int env_int_or_default(const char *name, int fallback) { const char *value = getenv(name); int parsed; if (value == NULL || value[0] == '\0') { return fallback; } parsed = atoi(value); if (parsed <= 0) { return fallback; } return parsed; } static uint64_t env_u64_or_default(const char *name, uint64_t fallback) { const char *value = getenv(name); unsigned long long parsed = 0ULL; char *endptr = NULL; if (value == NULL || value[0] == '\0') { return fallback; } parsed = strtoull(value, &endptr, 10); if (endptr == value || *endptr != '\0' || parsed == 0ULL) { return fallback; } return (uint64_t) parsed; } static int64_t realtime_epoch_ms(void) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); return (int64_t) ts.tv_sec * 1000 + ts.tv_nsec / 1000000; } static long realtime_epoch_sec(void) { return (long) time(NULL); } static void update_thread_heartbeat(atomic_long *heartbeat) { if (heartbeat == NULL) { return; } atomic_store(heartbeat, realtime_epoch_sec()); } static int should_log_control_latency(const daemon_state_t *state, const message_t *msg) { uint64_t sample_mod; if (state == NULL || state->control_latency_logger == NULL || msg == NULL) { return 0; } sample_mod = state->control_latency_sample_mod; if (sample_mod <= 1U) { return 1; } return msg->id % sample_mod == 0U; } static int should_send_control_ack(const daemon_state_t *state, const message_t *msg) { uint64_t sample_mod; if (state == NULL || msg == NULL) { return 0; } sample_mod = state->control_ack_sample_mod; if (sample_mod <= 1U) { return 1; } return msg->id % sample_mod == 0U; } static void video_pipeline_heartbeat_progress(void *context) { update_thread_heartbeat((atomic_long *) context); } static int ensure_runtime_dir(const char *runtime_dir) { struct stat st; if (runtime_dir == NULL || runtime_dir[0] == '\0') { errno = EINVAL; return -1; } if (stat(runtime_dir, &st) == 0) { if (S_ISDIR(st.st_mode)) { return 0; } errno = ENOTDIR; return -1; } if (errno != ENOENT) { return -1; } if (mkdir(runtime_dir, 0775) != 0 && errno != EEXIST) { return -1; } return 0; } static int path_exists(const char *path) { return path != NULL && path[0] != '\0' && access(path, F_OK) == 0; } static int consume_fault_flag(const char *path) { if (!path_exists(path)) { return 0; } unlink(path); return 1; } static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) { if (state == NULL || fault_path == NULL || thread_name == NULL) { return; } if (!consume_fault_flag(fault_path)) { return; } fprintf( stderr, "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n", thread_name, state->heartbeat_timeout_sec ); sleep((unsigned int) state->heartbeat_timeout_sec + 2U); } static int control_bridge_stats_init(control_bridge_stats_t *stats) { int rc; if (stats == NULL) { errno = EINVAL; return -1; } memset(stats, 0, sizeof(*stats)); rc = pthread_mutex_init(&stats->mutex, NULL); if (rc != 0) { errno = rc; return -1; } return 0; } static void control_bridge_stats_destroy(control_bridge_stats_t *stats) { if (stats == NULL) { return; } pthread_mutex_destroy(&stats->mutex); } static void unix_dgram_client_close(unix_dgram_client_t *client); static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats); static void close_control_ack_client(kcp_client_t **client_ptr); static int control_ack_enabled(const daemon_state_t *state) { return state != NULL && state->control_ack_peer_id != NULL && state->control_ack_peer_id[0] != '\0' && state->control_ack_target_peer != NULL && state->control_ack_target_peer[0] != '\0'; } static int control_ack_manager_init(daemon_state_t *state) { int rc; if (state == NULL) { errno = EINVAL; return -1; } rc = pthread_mutex_init(&state->control_ack_mutex, NULL); if (rc != 0) { errno = rc; return -1; } atomic_init(&state->control_ack_shutdown_requested, 0); state->control_ack_client = NULL; state->control_ack_thread_started = 0; state->control_ack_connect_requested = 0; state->control_ack_connect_inflight = 0; return 0; } static void control_ack_manager_reset(daemon_state_t *state, int request_connect) { kcp_client_t *client = NULL; if (state == NULL) { return; } pthread_mutex_lock(&state->control_ack_mutex); client = state->control_ack_client; state->control_ack_client = NULL; state->control_ack_connect_requested = request_connect && control_ack_enabled(state) && state->control_ack_thread_started; pthread_mutex_unlock(&state->control_ack_mutex); close_control_ack_client(&client); } static void control_ack_manager_destroy(daemon_state_t *state) { if (state == NULL) { return; } atomic_store(&state->control_ack_shutdown_requested, 1); if (state->control_ack_thread_started) { pthread_join(state->control_ack_thread, NULL); state->control_ack_thread_started = 0; } control_ack_manager_reset(state, 0); pthread_mutex_destroy(&state->control_ack_mutex); } static int write_status_json_atomic(const char *path, cJSON *root) { char *json; char temp_path[640]; FILE *file; size_t json_len; if (path == NULL || root == NULL) { errno = EINVAL; return -1; } json = cJSON_PrintUnformatted(root); if (json == NULL) { errno = ENOMEM; return -1; } snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid()); file = fopen(temp_path, "wb"); if (file == NULL) { cJSON_free(json); return -1; } json_len = strlen(json); if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) { int saved_errno = errno; fclose(file); unlink(temp_path); cJSON_free(json); errno = saved_errno; return -1; } if (fclose(file) != 0) { int saved_errno = errno; unlink(temp_path); cJSON_free(json); errno = saved_errno; return -1; } if (rename(temp_path, path) != 0) { int saved_errno = errno; unlink(temp_path); cJSON_free(json); errno = saved_errno; return -1; } cJSON_free(json); return 0; } static int write_daemon_status_file(daemon_state_t *state) { cJSON *root; video_pipeline_stats_t video_stats; control_bridge_stats_t control_stats; int rc; if (state == NULL) { errno = EINVAL; return -1; } if (ensure_runtime_dir(state->runtime_dir) != 0) { return -1; } memset(&video_stats, 0, sizeof(video_stats)); memset(&control_stats, 0, sizeof(control_stats)); video_pipeline_stats_snapshot(&state->video_stats, &video_stats); control_bridge_stats_snapshot(&state->control_stats, &control_stats); root = cJSON_CreateObject(); if (root == NULL) { errno = ENOMEM; return -1; } cJSON_AddNumberToObject(root, "updated_at_epoch_ms", (double) realtime_epoch_ms()); cJSON_AddNumberToObject(root, "pid", (double) getpid()); cJSON_AddNumberToObject(root, "video_thread_heartbeat_epoch_ms", (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0); cJSON_AddNumberToObject(root, "control_thread_heartbeat_epoch_ms", (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0); cJSON_AddBoolToObject(root, "video_connected", video_stats.connected != 0); cJSON_AddNumberToObject(root, "video_frames_sent", (double) video_stats.frames_sent); cJSON_AddNumberToObject(root, "video_send_errors", (double) video_stats.send_errors); cJSON_AddNumberToObject(root, "video_backlog_resets", (double) video_stats.backlog_resets); cJSON_AddNumberToObject(root, "video_last_capture_to_send_ms", (double) video_stats.last_capture_to_send_ms); cJSON_AddNumberToObject(root, "video_avg_capture_to_send_ms", video_stats.avg_capture_to_send_ms); cJSON_AddStringToObject(root, "video_last_error", video_stats.last_error); cJSON_AddBoolToObject(root, "control_registered", control_stats.registered != 0); cJSON_AddNumberToObject(root, "control_reconnect_count", (double) control_stats.reconnect_count); cJSON_AddNumberToObject(root, "control_unix_send_errors", (double) control_stats.unix_send_errors); cJSON_AddStringToObject(root, "control_last_error", control_stats.last_error); rc = write_status_json_atomic(state->status_file_path, root); cJSON_Delete(root); return rc; } static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) { long heartbeat_sec; if (heartbeat == NULL || timeout_sec <= 0) { return 0; } heartbeat_sec = atomic_load(heartbeat); if (heartbeat_sec <= 0) { return 0; } return now_sec - heartbeat_sec > timeout_sec; } static void exit_if_thread_stalled(daemon_state_t *state) { long now_sec; if (state == NULL || state->heartbeat_timeout_sec <= 0) { return; } now_sec = realtime_epoch_sec(); if (thread_heartbeat_expired(&state->video_thread_heartbeat_epoch_sec, state->heartbeat_timeout_sec, now_sec)) { fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec); fflush(stderr); exit(EXIT_CODE_VIDEO_THREAD_STALLED); } if (thread_heartbeat_expired(&state->control_thread_heartbeat_epoch_sec, state->heartbeat_timeout_sec, now_sec)) { fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec); fflush(stderr); exit(EXIT_CODE_CONTROL_THREAD_STALLED); } } static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) { if (stats == NULL) { return; } pthread_mutex_lock(&stats->mutex); snprintf(stats->last_error, sizeof(stats->last_error), "%s", message == NULL ? "" : message); pthread_mutex_unlock(&stats->mutex); } static void control_bridge_set_reconnect_reason(control_bridge_stats_t *stats, const char *message) { if (stats == NULL) { return; } pthread_mutex_lock(&stats->mutex); snprintf(stats->last_reconnect_reason, sizeof(stats->last_reconnect_reason), "%s", message == NULL ? "" : message); pthread_mutex_unlock(&stats->mutex); } static void control_bridge_set_errno_error(control_bridge_stats_t *stats, const char *prefix) { char buffer[256]; int saved_errno = errno; snprintf( buffer, sizeof(buffer), "%s: %s (errno=%d)", prefix == NULL ? "control bridge error" : prefix, saved_errno != 0 ? strerror(saved_errno) : "unknown error", saved_errno ); control_bridge_set_error(stats, buffer); } static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats) { if (stats == NULL || out_stats == NULL) { return; } memset(out_stats, 0, sizeof(*out_stats)); pthread_mutex_lock(&stats->mutex); out_stats->packets_forwarded = stats->packets_forwarded; out_stats->invalid_packets = stats->invalid_packets; out_stats->unix_send_errors = stats->unix_send_errors; out_stats->reconnect_count = stats->reconnect_count; out_stats->server_idle_ms = stats->server_idle_ms; out_stats->registered = stats->registered; snprintf(out_stats->last_error, sizeof(out_stats->last_error), "%s", stats->last_error); snprintf(out_stats->last_reconnect_reason, sizeof(out_stats->last_reconnect_reason), "%s", stats->last_reconnect_reason); out_stats->transport = stats->transport; pthread_mutex_unlock(&stats->mutex); } static int control_server_error_requires_reconnect(const char *message) { if (message == NULL || message[0] == '\0') { return 0; } return strstr(message, "not registered") != NULL || strstr(message, "first message must be register") != NULL || strstr(message, "peer replaced") != NULL || strstr(message, "timed out waiting for server_register_ok") != NULL; } static void control_message_body_to_cstr(const message_t *msg, char *buffer, size_t buffer_len) { size_t copy_len; if (buffer == NULL || buffer_len == 0) { return; } buffer[0] = '\0'; if (msg == NULL || msg->body == NULL || msg->body_len == 0) { return; } copy_len = msg->body_len < (buffer_len - 1U) ? msg->body_len : (buffer_len - 1U); memcpy(buffer, msg->body, copy_len); buffer[copy_len] = '\0'; } static kcp_client_t *connect_control_ack_client(const daemon_state_t *state) { kcp_conn_options_t options; if (state == NULL || state->control_ack_peer_id == NULL || state->control_ack_peer_id[0] == '\0') { errno = EINVAL; return NULL; } kcp_conn_options_set_control_defaults(&options); return kcp_client_dial_with_options( state->control_server_addr, state->control_relay_via, state->control_ack_peer_id, state->control_bind_ip, state->control_bind_device, &options, NULL, NULL, state->stats_logger, state->stats_interval_ms ); } static void close_control_ack_client(kcp_client_t **client_ptr) { if (client_ptr == NULL || *client_ptr == NULL) { return; } kcp_client_close(*client_ptr); kcp_client_free(*client_ptr); *client_ptr = NULL; } static void control_ack_manager_request_connect(daemon_state_t *state) { if (state == NULL || !control_ack_enabled(state) || !state->control_ack_thread_started) { return; } pthread_mutex_lock(&state->control_ack_mutex); if (state->control_ack_client == NULL) { state->control_ack_connect_requested = 1; } pthread_mutex_unlock(&state->control_ack_mutex); } static void *control_ack_thread_main(void *arg) { daemon_state_t *state = (daemon_state_t *) arg; while (!atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) { kcp_client_t *client = NULL; int connect_failed = 0; int should_connect = 0; pthread_mutex_lock(&state->control_ack_mutex); if (state->control_ack_connect_requested && state->control_ack_client == NULL && !state->control_ack_connect_inflight) { state->control_ack_connect_inflight = 1; should_connect = 1; } pthread_mutex_unlock(&state->control_ack_mutex); if (!should_connect) { usleep(200000); continue; } client = connect_control_ack_client(state); connect_failed = client == NULL; pthread_mutex_lock(&state->control_ack_mutex); state->control_ack_connect_inflight = 0; if ( client != NULL && state->control_ack_connect_requested && state->control_ack_client == NULL && !atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested ) { state->control_ack_client = client; state->control_ack_connect_requested = 0; client = NULL; } pthread_mutex_unlock(&state->control_ack_mutex); if (client != NULL) { close_control_ack_client(&client); } if (connect_failed && !atomic_load(&state->control_ack_shutdown_requested) && !*state->stop_requested) { sleep(1); } } return NULL; } static void maybe_send_control_ack( daemon_state_t *state, const message_t *msg, int64_t recv_unix_nano, int64_t persist_end_unix_nano, const char *sample_reason ) { kcp_client_t *ack_client = NULL; kcp_client_t *client_to_close = NULL; char *payload = NULL; int send_rc = -1; if ( state == NULL || msg == NULL || recv_unix_nano <= 0 || persist_end_unix_nano <= recv_unix_nano || !control_ack_enabled(state) || !state->control_ack_thread_started ) { return; } payload = omni_strdup_printf( "{\"message_id\":%" PRIu64 ",\"ack_phase\":\"persist_end\",\"b_recv_to_persist_us\":%" PRId64 ",\"unix_send_ok\":true,\"sample_reason\":\"%s\"}", msg->id, (persist_end_unix_nano - recv_unix_nano) / 1000, sample_reason == NULL ? "sample_mod" : sample_reason ); if (payload == NULL) { return; } pthread_mutex_lock(&state->control_ack_mutex); ack_client = state->control_ack_client; if (ack_client == NULL) { state->control_ack_connect_requested = 1; pthread_mutex_unlock(&state->control_ack_mutex); free(payload); return; } send_rc = kcp_client_send_text(ack_client, state->control_ack_target_peer, payload); if (send_rc != 0) { client_to_close = state->control_ack_client; state->control_ack_client = NULL; state->control_ack_connect_requested = 1; } pthread_mutex_unlock(&state->control_ack_mutex); free(payload); if (client_to_close != NULL) { close_control_ack_client(&client_to_close); } } static int unix_dgram_client_init(unix_dgram_client_t *client, const char *dest_path) { struct sockaddr_un bind_addr; pid_t pid; if (client == NULL || dest_path == NULL || dest_path[0] == '\0') { errno = EINVAL; return -1; } memset(client, 0, sizeof(*client)); client->fd = socket(AF_UNIX, SOCK_DGRAM, 0); if (client->fd < 0) { return -1; } memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sun_family = AF_UNIX; pid = getpid(); snprintf(client->bind_path, sizeof(client->bind_path), "/tmp/omnisocket-b-side-cmd-client-%ld.sock", (long) pid); unlink(client->bind_path); snprintf(bind_addr.sun_path, sizeof(bind_addr.sun_path), "%s", client->bind_path); if (bind(client->fd, (const struct sockaddr *) &bind_addr, sizeof(bind_addr)) != 0) { close(client->fd); unlink(client->bind_path); client->fd = -1; return -1; } memset(&client->dest_addr, 0, sizeof(client->dest_addr)); client->dest_addr.sun_family = AF_UNIX; snprintf(client->dest_path, sizeof(client->dest_path), "%s", dest_path); snprintf(client->dest_addr.sun_path, sizeof(client->dest_addr.sun_path), "%s", dest_path); client->dest_len = (socklen_t) sizeof(client->dest_addr); return 0; } static int unix_dgram_client_send(unix_dgram_client_t *client, const void *data, size_t len) { ssize_t written; if (client == NULL || client->fd < 0 || (data == NULL && len > 0)) { errno = EINVAL; return -1; } written = sendto(client->fd, data, len, 0, (const struct sockaddr *) &client->dest_addr, client->dest_len); if (written < 0 || (size_t) written != len) { if (written >= 0) { errno = EIO; } return -1; } return 0; } static int unix_dgram_client_reopen(unix_dgram_client_t *client) { char dest_path[sizeof(client->dest_path)]; if (client == NULL || client->dest_path[0] == '\0') { errno = EINVAL; return -1; } snprintf(dest_path, sizeof(dest_path), "%s", client->dest_path); unix_dgram_client_close(client); return unix_dgram_client_init(client, dest_path); } static int unix_dgram_client_should_reopen(int error_code) { return error_code == ENOENT || error_code == ECONNREFUSED || error_code == EBADF || error_code == ENOTCONN; } static void unix_dgram_client_close(unix_dgram_client_t *client) { if (client == NULL) { return; } if (client->fd >= 0) { close(client->fd); client->fd = -1; } if (client->bind_path[0] != '\0') { unlink(client->bind_path); client->bind_path[0] = '\0'; } } static void *video_thread_main(void *arg) { daemon_state_t *state = (daemon_state_t *) arg; while (!*state->stop_requested) { update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec); maybe_inject_thread_stall(state, state->video_thread_fault_file, "video"); int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested); update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec); if (video_rc == 0) { break; } if (video_rc == VIDEO_PIPELINE_RUN_RETRY_IMMEDIATE) { continue; } if (!*state->stop_requested) { sleep(1); } } return NULL; } static void *control_thread_main(void *arg) { daemon_state_t *state = (daemon_state_t *) arg; while (!*state->stop_requested) { kcp_conn_options_t options; kcp_client_t *client = NULL; int reconnect_immediately = 0; update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); maybe_inject_thread_stall(state, state->control_thread_fault_file, "control"); kcp_conn_options_set_control_defaults(&options); client = kcp_client_dial_with_options( state->control_server_addr, state->control_relay_via, state->control_peer_id, state->control_bind_ip, state->control_bind_device, &options, NULL, NULL, state->stats_logger, state->stats_interval_ms ); if (client == NULL) { control_bridge_set_errno_error(&state->control_stats, "failed to connect control session"); sleep(1); continue; } { kcp_client_state_t client_state; memset(&client_state, 0, sizeof(client_state)); kcp_client_state_snapshot(client, &client_state); pthread_mutex_lock(&state->control_stats.mutex); if (state->control_stats.ever_connected) { state->control_stats.reconnect_count += 1; } else { state->control_stats.ever_connected = 1; } state->control_stats.registered = client_state.registered; state->control_stats.server_idle_ms = client_state.server_idle_ms; state->control_stats.last_reconnect_reason[0] = '\0'; snprintf(state->control_stats.last_error, sizeof(state->control_stats.last_error), "%s", client_state.last_server_error); kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); } control_ack_manager_request_connect(state); while (!*state->stop_requested) { message_t msg; int rc; kcp_client_state_t client_state; int ack_sampled = 0; int log_control_latency = 0; int64_t recv_unix_nano = 0; int64_t persist_begin_unix_nano = 0; int64_t persist_end_unix_nano = 0; update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); protocol_message_init(&msg); rc = kcp_client_receive_timed(client, &msg, 100); update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); if (rc == 1) { char reconnect_reason[256]; protocol_message_clear(&msg); memset(&client_state, 0, sizeof(client_state)); kcp_client_state_snapshot(client, &client_state); pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.registered = client_state.registered; state->control_stats.server_idle_ms = client_state.server_idle_ms; snprintf(state->control_stats.last_error, sizeof(state->control_stats.last_error), "%s", client_state.last_server_error); kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); if (!client_state.registered) { snprintf(reconnect_reason, sizeof(reconnect_reason), "control session stale: server reported unregistered"); } else if ( state->control_server_idle_reconnect_ms > 0 && client_state.server_idle_ms >= (uint32_t) state->control_server_idle_reconnect_ms ) { snprintf( reconnect_reason, sizeof(reconnect_reason), "control session stale: server idle timeout (%u ms >= %d ms)", client_state.server_idle_ms, state->control_server_idle_reconnect_ms ); } else if (control_server_error_requires_reconnect(client_state.last_server_error)) { snprintf( reconnect_reason, sizeof(reconnect_reason), "control session stale: server error %.180s", client_state.last_server_error ); } else { reconnect_reason[0] = '\0'; } if (reconnect_reason[0] != '\0') { control_bridge_set_error(&state->control_stats, reconnect_reason); control_bridge_set_reconnect_reason(&state->control_stats, reconnect_reason); fprintf(stderr, "[b_side_omnid] %s\n", reconnect_reason); reconnect_immediately = 1; break; } continue; } if (rc != 0) { memset(&client_state, 0, sizeof(client_state)); kcp_client_state_snapshot(client, &client_state); pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.registered = client_state.registered; state->control_stats.server_idle_ms = client_state.server_idle_ms; kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); if (client_state.last_server_error[0] != '\0') { control_bridge_set_error(&state->control_stats, client_state.last_server_error); if (control_server_error_requires_reconnect(client_state.last_server_error)) { control_bridge_set_reconnect_reason(&state->control_stats, client_state.last_server_error); reconnect_immediately = 1; } } else { control_bridge_set_errno_error(&state->control_stats, "control receive loop stopped"); } protocol_message_clear(&msg); break; } if (msg.type == MSG_TYPE_ERROR && strcmp(msg.from, SERVER_PEER_ID) == 0) { char server_error[256]; control_message_body_to_cstr(&msg, server_error, sizeof(server_error)); control_bridge_set_error(&state->control_stats, server_error); if (control_server_error_requires_reconnect(server_error)) { char reconnect_reason[256]; snprintf(reconnect_reason, sizeof(reconnect_reason), "control session stale: server error %.180s", server_error); control_bridge_set_reconnect_reason(&state->control_stats, reconnect_reason); fprintf(stderr, "[b_side_omnid] %s\n", reconnect_reason); reconnect_immediately = 1; protocol_message_clear(&msg); break; } protocol_message_clear(&msg); continue; } if (state->control_expected_sender[0] != '\0' && strcmp(msg.from, state->control_expected_sender) != 0) { pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.invalid_packets += 1; pthread_mutex_unlock(&state->control_stats.mutex); protocol_message_clear(&msg); continue; } if (msg.type != MSG_TYPE_BINARY || msg.body_len != OMNI_CONTROL_PACKET_SIZE) { pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.invalid_packets += 1; pthread_mutex_unlock(&state->control_stats.mutex); protocol_message_clear(&msg); continue; } ack_sampled = should_send_control_ack(state, &msg); log_control_latency = ack_sampled || should_log_control_latency(state, &msg); if (log_control_latency) { recv_unix_nano = omni_now_unix_nano(); persist_begin_unix_nano = recv_unix_nano; latencylog_log_message_event_at( state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_APP_RECV, recv_unix_nano, &msg ); latencylog_log_message_event_at( state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_BEGIN, persist_begin_unix_nano, &msg ); } if (unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) != 0) { int send_errno = errno; int recovered = 0; if (unix_dgram_client_should_reopen(send_errno) && unix_dgram_client_reopen(&state->unix_client) == 0) { recovered = unix_dgram_client_send(&state->unix_client, msg.body, msg.body_len) == 0; } if (recovered) { memset(&client_state, 0, sizeof(client_state)); kcp_client_state_snapshot(client, &client_state); pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.packets_forwarded += 1; state->control_stats.registered = client_state.registered; state->control_stats.server_idle_ms = client_state.server_idle_ms; kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); if (log_control_latency) { persist_end_unix_nano = omni_now_unix_nano(); latencylog_log_message_event_at( state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, persist_end_unix_nano, &msg ); } if (ack_sampled) { maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod"); } protocol_message_clear(&msg); continue; } errno = send_errno; pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.unix_send_errors += 1; pthread_mutex_unlock(&state->control_stats.mutex); control_bridge_set_errno_error(&state->control_stats, "failed to forward command to unix socket"); protocol_message_clear(&msg); continue; } memset(&client_state, 0, sizeof(client_state)); kcp_client_state_snapshot(client, &client_state); pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.packets_forwarded += 1; state->control_stats.registered = client_state.registered; state->control_stats.server_idle_ms = client_state.server_idle_ms; kcp_client_runtime_stats_snapshot(client, &state->control_stats.transport); pthread_mutex_unlock(&state->control_stats.mutex); if (log_control_latency) { persist_end_unix_nano = omni_now_unix_nano(); latencylog_log_message_event_at( state->control_latency_logger, OMNI_NODE_ROLE_PEER, state->control_peer_id, EVENT_B_PERSIST_END, persist_end_unix_nano, &msg ); } if (ack_sampled) { maybe_send_control_ack(state, &msg, recv_unix_nano, persist_end_unix_nano, "sample_mod"); } protocol_message_clear(&msg); } pthread_mutex_lock(&state->control_stats.mutex); state->control_stats.registered = 0; state->control_stats.server_idle_ms = 0; pthread_mutex_unlock(&state->control_stats.mutex); control_ack_manager_reset(state, 0); kcp_client_close(client); kcp_client_free(client); if (!*state->stop_requested && !reconnect_immediately) { sleep(1); } } return NULL; } static void print_stats(daemon_state_t *state) { video_pipeline_stats_t video_stats; control_bridge_stats_t control_stats; memset(&video_stats, 0, sizeof(video_stats)); memset(&control_stats, 0, sizeof(control_stats)); video_pipeline_stats_snapshot(&state->video_stats, &video_stats); control_bridge_stats_snapshot(&state->control_stats, &control_stats); fprintf( stderr, "[b_side_omnid] video registered=%d frames=%llu bytes=%llu drops=%llu resets=%llu backlog=%u cap2send=%ums avg=%.1fms reason=%s srtt=%dms | control registered=%d idle=%ums reconnects=%llu forwarded=%llu invalid=%llu unix_err=%llu srtt=%dms last_reconnect=%s\n", video_stats.connected, (unsigned long long) video_stats.frames_sent, (unsigned long long) video_stats.bytes_sent, (unsigned long long) video_stats.backpressure_drops, (unsigned long long) video_stats.backlog_resets, video_stats.last_backlog_segments, video_stats.last_capture_to_send_ms, video_stats.avg_capture_to_send_ms, video_stats.last_backlog_reason[0] == '\0' ? "-" : video_stats.last_backlog_reason, video_stats.transport.srtt_ms, control_stats.registered, control_stats.server_idle_ms, (unsigned long long) control_stats.reconnect_count, (unsigned long long) control_stats.packets_forwarded, (unsigned long long) control_stats.invalid_packets, (unsigned long long) control_stats.unix_send_errors, control_stats.transport.srtt_ms, control_stats.last_reconnect_reason[0] == '\0' ? "-" : control_stats.last_reconnect_reason ); } int main(void) { daemon_state_t state; pthread_t video_thread; pthread_t control_thread; long initial_heartbeat; memset(&state, 0, sizeof(state)); state.stop_requested = &g_stop_requested; video_pipeline_config_init(&state.video_config); video_pipeline_config_load_env(&state.video_config); state.control_server_addr = env_first_nonempty("OMNI_CONTROL_SERVER_ADDR", "OMNISOCKET_SERVER_ADDR", ""); state.control_relay_via = env_first_nonempty("OMNI_CONTROL_RELAY_VIA", "OMNISOCKET_RELAY_VIA", ""); state.control_bind_ip = env_first_nonempty("OMNI_CONTROL_BIND_IP", "OMNISOCKET_BIND_IP", ""); state.control_bind_device = env_first_nonempty("OMNI_CONTROL_BIND_DEVICE", "OMNISOCKET_BIND_DEVICE", ""); state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID); state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER); state.control_ack_peer_id = env_or_default("OMNI_CONTROL_ACK_PEER_ID", CONTROL_ACK_DEFAULT_PEER_ID); state.control_ack_target_peer = env_or_default("OMNI_CONTROL_ACK_TARGET_PEER", CONTROL_ACK_DEFAULT_TARGET_PEER); state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET); state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR); state.heartbeat_timeout_sec = env_int_or_default( "BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC", DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC ); state.stats_interval_ms = env_int_or_default("BLITZ_KCP_STATS_INTERVAL_MS", DEFAULT_KCP_STATS_INTERVAL_MS); state.control_latency_sample_mod = env_u64_or_default("BLITZ_CONTROL_LATENCY_LOG_SAMPLE_MOD", DEFAULT_CONTROL_LATENCY_SAMPLE_MOD); state.control_ack_sample_mod = env_u64_or_default("BLITZ_CONTROL_ACK_SAMPLE_MOD", DEFAULT_CONTROL_ACK_SAMPLE_MOD); state.video_config.progress_callback = video_pipeline_heartbeat_progress; state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec; state.video_config.stats_logger = NULL; state.video_config.stage_logger = NULL; state.video_config.stats_interval_ms = state.stats_interval_ms; state.control_server_idle_reconnect_ms = env_int_or_default( "OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS", CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS ); snprintf(state.status_file_path, sizeof(state.status_file_path), "%s/%s", state.runtime_dir, DEFAULT_STATUS_FILE_NAME); snprintf( state.video_thread_fault_file, sizeof(state.video_thread_fault_file), "%s/%s", state.runtime_dir, DEFAULT_VIDEO_THREAD_FAULT_FILE ); snprintf( state.control_thread_fault_file, sizeof(state.control_thread_fault_file), "%s/%s", state.runtime_dir, DEFAULT_CONTROL_THREAD_FAULT_FILE ); initial_heartbeat = realtime_epoch_sec(); atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat); atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat); if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' || state.control_server_addr == NULL || state.control_server_addr[0] == '\0') { fprintf(stderr, "OMNISOCKET_SERVER_ADDR (or session-specific overrides) is required\n"); return 1; } if (video_pipeline_stats_init(&state.video_stats) != 0) { perror("video_pipeline_stats_init"); return 1; } if (control_bridge_stats_init(&state.control_stats) != 0) { perror("control_bridge_stats_init"); video_pipeline_stats_destroy(&state.video_stats); return 1; } if (control_ack_manager_init(&state) != 0) { perror("control_ack_manager_init"); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); return 1; } if (unix_dgram_client_init(&state.unix_client, state.control_unix_socket) != 0) { perror("unix_dgram_client_init"); control_ack_manager_destroy(&state); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); return 1; } fprintf( stderr, "[b_side_omnid] control forwarding target is unix_dgram://%s\n", state.control_unix_socket ); if (install_signal_handler(SIGINT) != 0 || install_signal_handler(SIGTERM) != 0) { perror("install_signal_handler"); unix_dgram_client_close(&state.unix_client); control_ack_manager_destroy(&state); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); return 1; } { const char *stats_log_path = getenv("BLITZ_KCP_STATS_LOG_PATH"); const char *latency_log_path = getenv("BLITZ_CONTROL_LATENCY_LOG_PATH"); const char *video_stage_log_path = getenv("BLITZ_VIDEO_STAGE_LOG_PATH"); int latency_enabled = env_int_or_default("BLITZ_CONTROL_LATENCY_LOG_ENABLED", 1); int video_stage_log_enabled = env_int_or_default("BLITZ_VIDEO_STAGE_LOG_ENABLED", 1); uint64_t video_stage_log_sample_mod = env_u64_or_default("BLITZ_VIDEO_STAGE_LOG_SAMPLE_MOD", 10); if (stats_log_path != NULL && stats_log_path[0] != '\0') { state.stats_logger = kcp_session_stats_open_jsonl(stats_log_path); if (state.stats_logger == NULL) { fprintf(stderr, "[b_side_omnid] warning: failed to open KCP stats log %s\n", stats_log_path); } } if (latency_enabled && latency_log_path != NULL && latency_log_path[0] != '\0') { state.control_latency_logger = latencylog_open_jsonl(latency_log_path); if (state.control_latency_logger == NULL) { fprintf(stderr, "[b_side_omnid] warning: failed to open control latency log %s\n", latency_log_path); } } if (video_stage_log_enabled && video_stage_log_path != NULL && video_stage_log_path[0] != '\0') { state.video_stage_logger = video_stage_logger_open_jsonl(video_stage_log_path, video_stage_log_sample_mod); if (state.video_stage_logger == NULL) { fprintf(stderr, "[b_side_omnid] warning: failed to open video stage log %s\n", video_stage_log_path); } } state.video_config.stats_logger = state.stats_logger; state.video_config.stage_logger = state.video_stage_logger; state.video_config.stats_interval_ms = state.stats_interval_ms; } if (control_ack_enabled(&state)) { if (pthread_create(&state.control_ack_thread, NULL, control_ack_thread_main, &state) != 0) { fprintf(stderr, "[b_side_omnid] warning: failed to start async control ACK manager, ACK sampling disabled\n"); } else { state.control_ack_thread_started = 1; } } if (pthread_create(&video_thread, NULL, video_thread_main, &state) != 0) { perror("pthread_create(video_thread)"); unix_dgram_client_close(&state.unix_client); control_ack_manager_destroy(&state); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); latencylog_close(state.control_latency_logger); video_stage_logger_close(state.video_stage_logger); kcp_session_stats_close(state.stats_logger); return 1; } if (pthread_create(&control_thread, NULL, control_thread_main, &state) != 0) { perror("pthread_create(control_thread)"); g_stop_requested = 1; pthread_join(video_thread, NULL); unix_dgram_client_close(&state.unix_client); control_ack_manager_destroy(&state); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); latencylog_close(state.control_latency_logger); video_stage_logger_close(state.video_stage_logger); kcp_session_stats_close(state.stats_logger); return 1; } while (!g_stop_requested) { sleep(1); print_stats(&state); if (write_daemon_status_file(&state) != 0) { fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno)); } exit_if_thread_stalled(&state); } pthread_join(video_thread, NULL); pthread_join(control_thread, NULL); unix_dgram_client_close(&state.unix_client); control_ack_manager_destroy(&state); control_bridge_stats_destroy(&state.control_stats); video_pipeline_stats_destroy(&state.video_stats); latencylog_close(state.control_latency_logger); video_stage_logger_close(state.video_stage_logger); kcp_session_stats_close(state.stats_logger); return 0; }