feat: 自启动与自恢复机制
This commit is contained in:
@@ -3,12 +3,16 @@
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "cJSON.h"
|
||||
#include "control_protocol.h"
|
||||
#include "protocol.h"
|
||||
#include "video_pipeline.h"
|
||||
@@ -17,6 +21,13 @@
|
||||
#define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl"
|
||||
#define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock"
|
||||
#define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000
|
||||
/* Fallback runtime directory used when BLITZ_RUNTIME_DIR is not set. */
#define DEFAULT_RUNTIME_DIR "/run/blitz-robot"
|
||||
/* File name, joined under the runtime directory, of the JSON status file the main loop rewrites. */
#define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json"
|
||||
/* Flag file under the runtime directory; when consumed, the video thread sleeps past the heartbeat timeout. */
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
|
||||
/* Flag file under the runtime directory; when consumed, the control thread sleeps past the heartbeat timeout. */
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
|
||||
/* Fallback heartbeat timeout (seconds) used when BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC is not set. */
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
|
||||
/* Process exit status used when the video thread heartbeat is found stale. */
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
|
||||
/* Process exit status used when the control thread heartbeat is found stale. */
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
|
||||
|
||||
typedef struct unix_dgram_client {
|
||||
int fd;
|
||||
@@ -52,6 +63,13 @@ typedef struct daemon_state {
|
||||
const char *control_expected_sender;
|
||||
const char *control_unix_socket;
|
||||
int control_server_idle_reconnect_ms;
|
||||
const char *runtime_dir;
|
||||
int heartbeat_timeout_sec;
|
||||
char status_file_path[512];
|
||||
char video_thread_fault_file[512];
|
||||
char control_thread_fault_file[512];
|
||||
atomic_long video_thread_heartbeat_epoch_sec;
|
||||
atomic_long control_thread_heartbeat_epoch_sec;
|
||||
unix_dgram_client_t unix_client;
|
||||
control_bridge_stats_t control_stats;
|
||||
} daemon_state_t;
|
||||
@@ -109,6 +127,79 @@ static int env_int_or_default(const char *name, int fallback) {
|
||||
return parsed;
|
||||
}
|
||||
|
||||
/*
 * Return the current wall-clock time in milliseconds since the Unix epoch.
 *
 * Reads CLOCK_REALTIME. If clock_gettime() fails, the timespec would be left
 * uninitialized, so fall back to time() (second resolution) instead of
 * reading indeterminate values.
 */
static int64_t realtime_epoch_ms(void) {
    struct timespec ts;

    if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
        return (int64_t) time(NULL) * 1000;
    }
    return (int64_t) ts.tv_sec * 1000 + (int64_t) (ts.tv_nsec / 1000000);
}
|
||||
|
||||
/* Return the current wall-clock time as whole seconds since the Unix epoch. */
static long realtime_epoch_sec(void) {
    time_t now = time(NULL);

    return (long) now;
}
|
||||
|
||||
/*
 * Stamp a heartbeat slot with the current epoch second.
 *
 * heartbeat: atomic slot to overwrite; a NULL pointer is ignored.
 */
static void update_thread_heartbeat(atomic_long *heartbeat) {
    if (heartbeat != NULL) {
        long stamp = realtime_epoch_sec();

        atomic_store(heartbeat, stamp);
    }
}
|
||||
|
||||
/*
 * Progress hook installed on the video pipeline configuration.
 *
 * context: treated as a pointer to the atomic_long heartbeat slot to refresh.
 */
static void video_pipeline_heartbeat_progress(void *context) {
    atomic_long *heartbeat = (atomic_long *) context;

    update_thread_heartbeat(heartbeat);
}
|
||||
|
||||
/*
 * Make sure `runtime_dir` exists and is a directory, creating it (one level,
 * mode 0775 before umask) when it is missing.
 *
 * Returns 0 on success. Returns -1 with errno set on failure:
 *   EINVAL  - runtime_dir is NULL or empty
 *   ENOTDIR - the path exists but is not a directory
 *   other   - whatever stat()/mkdir() reported
 */
static int ensure_runtime_dir(const char *runtime_dir) {
    struct stat st;

    if (runtime_dir == NULL || runtime_dir[0] == '\0') {
        errno = EINVAL;
        return -1;
    }

    if (stat(runtime_dir, &st) != 0) {
        if (errno != ENOENT) {
            return -1;
        }
        if (mkdir(runtime_dir, 0775) == 0) {
            return 0;
        }
        if (errno != EEXIST) {
            return -1;
        }
        /*
         * Something created the path between stat() and mkdir(). Look again
         * so a non-directory that won the race is not reported as success.
         */
        if (stat(runtime_dir, &st) != 0) {
            return -1;
        }
    }

    if (!S_ISDIR(st.st_mode)) {
        errno = ENOTDIR;
        return -1;
    }
    return 0;
}
|
||||
|
||||
/*
 * Report whether `path` names an existing filesystem entry.
 *
 * Returns 1 when access(path, F_OK) succeeds, 0 otherwise (including for a
 * NULL or empty path).
 */
static int path_exists(const char *path) {
    if (path == NULL || path[0] == '\0') {
        return 0;
    }
    return access(path, F_OK) == 0;
}
|
||||
|
||||
/*
 * One-shot check for a fault-injection flag file.
 *
 * Returns 1 only when the flag file existed and this call removed it, and 0
 * otherwise. The unlink() result decides the outcome: a flag that cannot be
 * removed is not reported as consumed, so it cannot re-trigger the fault on
 * every call.
 */
static int consume_fault_flag(const char *path) {
    if (!path_exists(path)) {
        return 0;
    }
    return unlink(path) == 0;
}
|
||||
|
||||
/*
 * Fault-injection hook: if the flag file at `fault_path` can be consumed,
 * log the request and put the calling thread to sleep for the heartbeat
 * timeout plus two seconds.
 *
 * state:       daemon state; supplies heartbeat_timeout_sec.
 * fault_path:  flag file to consume.
 * thread_name: label used in the log message.
 *
 * Does nothing when any argument is NULL or the flag is absent.
 */
static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) {
    unsigned int timeout_sec;

    if (state == NULL || fault_path == NULL || thread_name == NULL) {
        return;
    }
    if (!consume_fault_flag(fault_path)) {
        return;
    }

    fprintf(
        stderr,
        "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n",
        thread_name,
        state->heartbeat_timeout_sec
    );

    /*
     * Clamp before converting: a negative timeout cast straight to unsigned
     * would wrap to an enormous sleep duration.
     */
    timeout_sec = state->heartbeat_timeout_sec > 0 ? (unsigned int) state->heartbeat_timeout_sec : 0U;
    sleep(timeout_sec + 2U);
}
|
||||
|
||||
static int control_bridge_stats_init(control_bridge_stats_t *stats) {
|
||||
int rc;
|
||||
if (stats == NULL) {
|
||||
@@ -132,6 +223,138 @@ static void control_bridge_stats_destroy(control_bridge_stats_t *stats) {
|
||||
}
|
||||
|
||||
static void unix_dgram_client_close(unix_dgram_client_t *client);
|
||||
static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats);
|
||||
|
||||
/*
 * Serialize `root` and publish it at `path` by writing a sibling temp file
 * ("<path>.tmp.<pid>") and renaming it over the destination.
 *
 * Returns 0 on success. Returns -1 with errno set on failure:
 *   EINVAL       - path or root is NULL, or the temp path could not be formatted
 *   ENAMETOOLONG - the temp path does not fit the local buffer
 *   ENOMEM       - JSON serialization failed
 *   other        - whatever fopen()/fwrite()/fflush()/fclose()/rename() reported
 * On failure the temp file is removed when it had been created.
 */
static int write_status_json_atomic(const char *path, cJSON *root) {
    char temp_path[640];
    char *json = NULL;
    FILE *file = NULL;
    size_t json_len;
    int temp_created = 0;
    int saved_errno;
    int close_rc;
    int n;

    if (path == NULL || root == NULL) {
        errno = EINVAL;
        return -1;
    }

    n = snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid());
    if (n < 0) {
        errno = EINVAL;
        return -1;
    }
    if ((size_t) n >= sizeof(temp_path)) {
        /* A truncated name could alias an unrelated file; refuse instead. */
        errno = ENAMETOOLONG;
        return -1;
    }

    json = cJSON_PrintUnformatted(root);
    if (json == NULL) {
        errno = ENOMEM;
        return -1;
    }

    file = fopen(temp_path, "wb");
    if (file == NULL) {
        goto fail;
    }
    temp_created = 1;

    json_len = strlen(json);
    if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) {
        goto fail;
    }

    close_rc = fclose(file);
    file = NULL; /* the stream is gone whether or not fclose() succeeded */
    if (close_rc != 0) {
        goto fail;
    }

    if (rename(temp_path, path) != 0) {
        goto fail;
    }

    cJSON_free(json);
    return 0;

fail:
    /* Capture errno first: the cleanup calls below may overwrite it. */
    saved_errno = errno;
    if (file != NULL) {
        fclose(file);
    }
    if (temp_created) {
        unlink(temp_path);
    }
    cJSON_free(json);
    errno = saved_errno;
    return -1;
}
|
||||
|
||||
/*
 * Build a JSON snapshot of the daemon (timestamps, pid, thread heartbeats,
 * video and control statistics) and write it to state->status_file_path.
 *
 * Returns 0 on success. Returns -1 with errno set on failure (EINVAL for a
 * NULL state, ENOMEM when the JSON root cannot be allocated, otherwise the
 * error from ensure_runtime_dir() or write_status_json_atomic()).
 *
 * NOTE(review): the results of the cJSON_Add* calls are not checked, so an
 * allocation failure there would silently drop a field - confirm whether
 * that is acceptable for consumers of the status file.
 */
static int write_daemon_status_file(daemon_state_t *state) {
    cJSON *root;
    video_pipeline_stats_t video_stats;
    control_bridge_stats_t control_stats;
    int saved_errno;
    int rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }
    if (ensure_runtime_dir(state->runtime_dir) != 0) {
        return -1;
    }

    memset(&video_stats, 0, sizeof(video_stats));
    memset(&control_stats, 0, sizeof(control_stats));
    video_pipeline_stats_snapshot(&state->video_stats, &video_stats);
    control_bridge_stats_snapshot(&state->control_stats, &control_stats);

    root = cJSON_CreateObject();
    if (root == NULL) {
        errno = ENOMEM;
        return -1;
    }

    cJSON_AddNumberToObject(root, "updated_at_epoch_ms", (double) realtime_epoch_ms());
    cJSON_AddNumberToObject(root, "pid", (double) getpid());
    /* Heartbeats are stored in seconds; scale to milliseconds for the report. */
    cJSON_AddNumberToObject(root, "video_thread_heartbeat_epoch_ms", (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0);
    cJSON_AddNumberToObject(root, "control_thread_heartbeat_epoch_ms", (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0);
    cJSON_AddBoolToObject(root, "video_connected", video_stats.connected != 0);
    cJSON_AddNumberToObject(root, "video_frames_sent", (double) video_stats.frames_sent);
    cJSON_AddNumberToObject(root, "video_send_errors", (double) video_stats.send_errors);
    cJSON_AddNumberToObject(root, "video_backlog_resets", (double) video_stats.backlog_resets);
    cJSON_AddStringToObject(root, "video_last_error", video_stats.last_error);
    cJSON_AddBoolToObject(root, "control_registered", control_stats.registered != 0);
    cJSON_AddNumberToObject(root, "control_reconnect_count", (double) control_stats.reconnect_count);
    cJSON_AddNumberToObject(root, "control_unix_send_errors", (double) control_stats.unix_send_errors);
    cJSON_AddStringToObject(root, "control_last_error", control_stats.last_error);

    rc = write_status_json_atomic(state->status_file_path, root);
    /* Keep the write error intact: releasing the tree may overwrite errno. */
    saved_errno = errno;
    cJSON_Delete(root);
    errno = saved_errno;
    return rc;
}
|
||||
|
||||
/*
 * Decide whether a heartbeat is older than the allowed timeout.
 *
 * heartbeat:   slot holding the last heartbeat, in epoch seconds.
 * timeout_sec: maximum tolerated age in seconds.
 * now_sec:     current time in epoch seconds.
 *
 * Returns 1 when (now_sec - heartbeat) is strictly greater than timeout_sec.
 * Returns 0 otherwise, including when heartbeat is NULL, timeout_sec is not
 * positive, or the stored heartbeat value is not positive.
 */
static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) {
    long last_beat_sec;
    long age_sec;

    if (heartbeat == NULL) {
        return 0;
    }
    if (timeout_sec <= 0) {
        return 0;
    }

    last_beat_sec = atomic_load(heartbeat);
    if (last_beat_sec <= 0) {
        return 0;
    }

    age_sec = now_sec - last_beat_sec;
    return age_sec > timeout_sec ? 1 : 0;
}
|
||||
|
||||
/*
 * Watchdog check: terminate the process when a worker thread's heartbeat is
 * older than state->heartbeat_timeout_sec.
 *
 * The video thread is checked first (exit status EXIT_CODE_VIDEO_THREAD_STALLED),
 * then the control thread (EXIT_CODE_CONTROL_THREAD_STALLED). Returns without
 * doing anything when state is NULL or the timeout is not positive.
 *
 * NOTE(review): exit() runs atexit handlers and flushes stdio while the
 * stalled thread may still exist - confirm that cannot block the shutdown.
 */
static void exit_if_thread_stalled(daemon_state_t *state) {
    int timeout_sec;
    long now_sec;

    if (state == NULL) {
        return;
    }
    timeout_sec = state->heartbeat_timeout_sec;
    if (timeout_sec <= 0) {
        return;
    }

    now_sec = realtime_epoch_sec();

    if (thread_heartbeat_expired(&state->video_thread_heartbeat_epoch_sec, timeout_sec, now_sec)) {
        fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_VIDEO_THREAD_STALLED);
    }

    if (thread_heartbeat_expired(&state->control_thread_heartbeat_epoch_sec, timeout_sec, now_sec)) {
        fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_CONTROL_THREAD_STALLED);
    }
}
|
||||
|
||||
static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) {
|
||||
if (stats == NULL) {
|
||||
@@ -295,7 +518,10 @@ static void *video_thread_main(void *arg) {
|
||||
daemon_state_t *state = (daemon_state_t *) arg;
|
||||
|
||||
while (!*state->stop_requested) {
|
||||
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
|
||||
maybe_inject_thread_stall(state, state->video_thread_fault_file, "video");
|
||||
int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested);
|
||||
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
|
||||
|
||||
if (video_rc == 0) {
|
||||
break;
|
||||
@@ -318,6 +544,8 @@ static void *control_thread_main(void *arg) {
|
||||
kcp_client_t *client = NULL;
|
||||
int reconnect_immediately = 0;
|
||||
|
||||
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
|
||||
maybe_inject_thread_stall(state, state->control_thread_fault_file, "control");
|
||||
kcp_conn_options_set_control_defaults(&options);
|
||||
client = kcp_client_dial_with_options(
|
||||
state->control_server_addr,
|
||||
@@ -361,8 +589,10 @@ static void *control_thread_main(void *arg) {
|
||||
int rc;
|
||||
kcp_client_state_t client_state;
|
||||
|
||||
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
|
||||
protocol_message_init(&msg);
|
||||
rc = kcp_client_receive_timed(client, &msg, 100);
|
||||
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
|
||||
if (rc == 1) {
|
||||
char reconnect_reason[256];
|
||||
|
||||
@@ -550,6 +780,7 @@ int main(void) {
|
||||
daemon_state_t state;
|
||||
pthread_t video_thread;
|
||||
pthread_t control_thread;
|
||||
long initial_heartbeat;
|
||||
|
||||
memset(&state, 0, sizeof(state));
|
||||
state.stop_requested = &g_stop_requested;
|
||||
@@ -563,10 +794,35 @@ int main(void) {
|
||||
state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID);
|
||||
state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER);
|
||||
state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET);
|
||||
state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR);
|
||||
state.heartbeat_timeout_sec = env_int_or_default(
|
||||
"BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
|
||||
DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
|
||||
);
|
||||
state.video_config.progress_callback = video_pipeline_heartbeat_progress;
|
||||
state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
|
||||
state.control_server_idle_reconnect_ms = env_int_or_default(
|
||||
"OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
|
||||
CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
|
||||
);
|
||||
snprintf(state.status_file_path, sizeof(state.status_file_path), "%s/%s", state.runtime_dir, DEFAULT_STATUS_FILE_NAME);
|
||||
snprintf(
|
||||
state.video_thread_fault_file,
|
||||
sizeof(state.video_thread_fault_file),
|
||||
"%s/%s",
|
||||
state.runtime_dir,
|
||||
DEFAULT_VIDEO_THREAD_FAULT_FILE
|
||||
);
|
||||
snprintf(
|
||||
state.control_thread_fault_file,
|
||||
sizeof(state.control_thread_fault_file),
|
||||
"%s/%s",
|
||||
state.runtime_dir,
|
||||
DEFAULT_CONTROL_THREAD_FAULT_FILE
|
||||
);
|
||||
initial_heartbeat = realtime_epoch_sec();
|
||||
atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat);
|
||||
atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat);
|
||||
|
||||
if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' ||
|
||||
state.control_server_addr == NULL || state.control_server_addr[0] == '\0') {
|
||||
@@ -624,6 +880,10 @@ int main(void) {
|
||||
while (!g_stop_requested) {
|
||||
sleep(1);
|
||||
print_stats(&state);
|
||||
if (write_daemon_status_file(&state) != 0) {
|
||||
fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno));
|
||||
}
|
||||
exit_if_thread_stalled(&state);
|
||||
}
|
||||
|
||||
pthread_join(video_thread, NULL);
|
||||
|
||||
Reference in New Issue
Block a user