feat: 自启动与自恢复机制

This commit is contained in:
2026-04-13 21:55:40 +08:00
parent 2f507a7546
commit 25c68530ba
19 changed files with 1151 additions and 451 deletions

View File

@@ -3,12 +3,16 @@
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <time.h>
#include <unistd.h>
#include "cJSON.h"
#include "control_protocol.h"
#include "protocol.h"
#include "video_pipeline.h"
@@ -17,6 +21,13 @@
#define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl"
#define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock"
#define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000
#define DEFAULT_RUNTIME_DIR "/run/blitz-robot"
#define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json"
#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall"
#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall"
#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15
#define EXIT_CODE_VIDEO_THREAD_STALLED 101
#define EXIT_CODE_CONTROL_THREAD_STALLED 102
typedef struct unix_dgram_client {
int fd;
@@ -52,6 +63,13 @@ typedef struct daemon_state {
const char *control_expected_sender;
const char *control_unix_socket;
int control_server_idle_reconnect_ms;
const char *runtime_dir;
int heartbeat_timeout_sec;
char status_file_path[512];
char video_thread_fault_file[512];
char control_thread_fault_file[512];
atomic_long video_thread_heartbeat_epoch_sec;
atomic_long control_thread_heartbeat_epoch_sec;
unix_dgram_client_t unix_client;
control_bridge_stats_t control_stats;
} daemon_state_t;
@@ -109,6 +127,79 @@ static int env_int_or_default(const char *name, int fallback) {
return parsed;
}
/*
 * Returns the current CLOCK_REALTIME reading as milliseconds since the Unix
 * epoch. The return value of clock_gettime() is not inspected.
 */
static int64_t realtime_epoch_ms(void) {
    struct timespec now;
    int64_t whole_seconds_ms;
    int64_t fraction_ms;

    clock_gettime(CLOCK_REALTIME, &now);
    whole_seconds_ms = (int64_t) now.tv_sec * 1000;
    fraction_ms = now.tv_nsec / 1000000;
    return whole_seconds_ms + fraction_ms;
}
/* Returns the current wall-clock time in whole seconds since the Unix epoch. */
static long realtime_epoch_sec(void) {
    time_t now = time(NULL);

    return (long) now;
}
/*
 * Stamps *heartbeat with the current wall-clock second.
 * A NULL heartbeat pointer is ignored.
 */
static void update_thread_heartbeat(atomic_long *heartbeat) {
    if (heartbeat != NULL) {
        atomic_store(heartbeat, realtime_epoch_sec());
    }
}
/*
 * Progress hook with the video_pipeline_progress_fn signature: interprets
 * context as an atomic_long heartbeat slot and refreshes it.
 */
static void video_pipeline_heartbeat_progress(void *context) {
    atomic_long *heartbeat = (atomic_long *) context;

    update_thread_heartbeat(heartbeat);
}
/*
 * Makes sure runtime_dir exists and is a directory, creating the final path
 * component with mode 0775 when it is missing.
 *
 * Returns 0 on success. Returns -1 with errno set otherwise: EINVAL for a
 * NULL/empty path, ENOTDIR when the path exists but is not a directory, or
 * the errno left by stat()/mkdir().
 */
static int ensure_runtime_dir(const char *runtime_dir) {
    struct stat info;

    if (runtime_dir == NULL || *runtime_dir == '\0') {
        errno = EINVAL;
        return -1;
    }

    if (stat(runtime_dir, &info) != 0) {
        /* Only a missing path can be fixed here; other stat() errors are passed through. */
        if (errno != ENOENT) {
            return -1;
        }
        /* EEXIST: the path appeared between stat() and mkdir(); treated as success. */
        if (mkdir(runtime_dir, 0775) == 0 || errno == EEXIST) {
            return 0;
        }
        return -1;
    }

    if (!S_ISDIR(info.st_mode)) {
        errno = ENOTDIR;
        return -1;
    }
    return 0;
}
/*
 * Returns non-zero when path is a non-empty string naming an existing file
 * system entry (access(F_OK) succeeds), and 0 otherwise.
 */
static int path_exists(const char *path) {
    if (path == NULL || path[0] == '\0') {
        return 0;
    }
    return access(path, F_OK) == 0;
}
/*
 * Checks for a one-shot fault-injection flag file and removes it.
 *
 * Returns 1 only when the flag existed and was actually removed, 0 otherwise.
 * A flag that cannot be removed is reported on stderr and treated as not
 * consumed: acting on it would re-trigger the same fault on every later
 * check, because the file would still be there.
 */
static int consume_fault_flag(const char *path) {
    if (!path_exists(path)) {
        return 0;
    }
    if (unlink(path) != 0) {
        /* ENOENT: something else removed the flag between the check and unlink(). */
        if (errno != ENOENT) {
            fprintf(stderr, "[b_side_omnid] failed to remove fault flag %s: %s\n", path, strerror(errno));
        }
        return 0;
    }
    return 1;
}
/*
 * Test hook: if the fault flag at fault_path is present, consumes it and
 * blocks the calling thread for heartbeat_timeout_sec + 2 seconds so that the
 * thread's heartbeat goes stale.
 *
 * state       daemon state; only heartbeat_timeout_sec is read.
 * fault_path  path of the flag file for this thread.
 * thread_name label used in the log line.
 *
 * Does nothing when any argument is NULL or the flag is absent. When the
 * heartbeat timeout is not positive the flag is still consumed but no sleep
 * happens: the stall check is disabled in that configuration, and casting a
 * negative timeout to unsigned would request an enormous sleep.
 */
static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) {
    int timeout_sec;

    if (state == NULL || fault_path == NULL || thread_name == NULL) {
        return;
    }
    if (!consume_fault_flag(fault_path)) {
        return;
    }

    timeout_sec = state->heartbeat_timeout_sec;
    if (timeout_sec <= 0) {
        fprintf(
            stderr,
            "[b_side_omnid] fault injection requested for %s thread ignored, heartbeat timeout is disabled (%d)\n",
            thread_name,
            timeout_sec
        );
        return;
    }

    fprintf(
        stderr,
        "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n",
        thread_name,
        timeout_sec
    );
    sleep((unsigned int) timeout_sec + 2U);
}
static int control_bridge_stats_init(control_bridge_stats_t *stats) {
int rc;
if (stats == NULL) {
@@ -132,6 +223,138 @@ static void control_bridge_stats_destroy(control_bridge_stats_t *stats) {
}
static void unix_dgram_client_close(unix_dgram_client_t *client);
static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats);
/*
 * Serializes root as compact JSON and publishes it at path by writing a
 * per-process temporary file ("<path>.tmp.<pid>") and renaming it over path,
 * so readers never observe a partially written document.
 *
 * Returns 0 on success. Returns -1 with errno set on failure: EINVAL for NULL
 * arguments, ENAMETOOLONG when the temporary path does not fit, ENOMEM when
 * serialization fails, otherwise the errno of the failing stdio/rename call.
 * The temporary file is removed on every failure path after it was created.
 * No fsync is issued, so the rename is not a durability guarantee.
 */
static int write_status_json_atomic(const char *path, cJSON *root) {
    char temp_path[640];
    char *json = NULL;
    FILE *file = NULL;
    size_t json_len;
    int temp_created = 0;
    int saved_errno = 0;
    int written;
    int rc = -1;

    if (path == NULL || root == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* Refuse a truncated temp path: it could name an unrelated file. */
    written = snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid());
    if (written < 0 || (size_t) written >= sizeof(temp_path)) {
        errno = ENAMETOOLONG;
        return -1;
    }

    json = cJSON_PrintUnformatted(root);
    if (json == NULL) {
        errno = ENOMEM;
        return -1;
    }

    file = fopen(temp_path, "wb");
    if (file == NULL) {
        saved_errno = errno;
        goto cleanup;
    }
    temp_created = 1;

    json_len = strlen(json);
    if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) {
        saved_errno = errno;
        goto cleanup;
    }

    if (fclose(file) != 0) {
        saved_errno = errno;
        file = NULL; /* the stream is gone even when fclose() reports an error */
        goto cleanup;
    }
    file = NULL;

    if (rename(temp_path, path) != 0) {
        saved_errno = errno;
        goto cleanup;
    }
    temp_created = 0; /* the temp file now lives at path */
    rc = 0;

cleanup:
    if (file != NULL) {
        fclose(file);
    }
    if (temp_created) {
        unlink(temp_path);
    }
    cJSON_free(json);
    if (rc != 0) {
        /* Report the original failure, not whatever the cleanup calls left behind. */
        errno = saved_errno;
    }
    return rc;
}
/*
 * Builds the daemon status document from the current video/control snapshots
 * and both thread heartbeats, then writes it to state->status_file_path via
 * write_status_json_atomic().
 *
 * Returns 0 on success, -1 with errno set on failure (EINVAL for a NULL
 * state, ENOMEM when the JSON object cannot be created, otherwise whatever
 * ensure_runtime_dir() or the writer reported).
 */
static int write_daemon_status_file(daemon_state_t *state) {
    video_pipeline_stats_t video_snapshot;
    control_bridge_stats_t control_snapshot;
    cJSON *doc;
    int write_rc;

    if (state == NULL) {
        errno = EINVAL;
        return -1;
    }
    if (ensure_runtime_dir(state->runtime_dir) != 0) {
        return -1;
    }

    memset(&video_snapshot, 0, sizeof(video_snapshot));
    memset(&control_snapshot, 0, sizeof(control_snapshot));
    video_pipeline_stats_snapshot(&state->video_stats, &video_snapshot);
    control_bridge_stats_snapshot(&state->control_stats, &control_snapshot);

    doc = cJSON_CreateObject();
    if (doc == NULL) {
        errno = ENOMEM;
        return -1;
    }

    /* Identity and freshness of this report. */
    cJSON_AddNumberToObject(doc, "updated_at_epoch_ms", (double) realtime_epoch_ms());
    cJSON_AddNumberToObject(doc, "pid", (double) getpid());

    /* Heartbeats are stored in seconds and published in milliseconds. */
    cJSON_AddNumberToObject(
        doc,
        "video_thread_heartbeat_epoch_ms",
        (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0
    );
    cJSON_AddNumberToObject(
        doc,
        "control_thread_heartbeat_epoch_ms",
        (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0
    );

    /* Video pipeline counters. */
    cJSON_AddBoolToObject(doc, "video_connected", video_snapshot.connected != 0);
    cJSON_AddNumberToObject(doc, "video_frames_sent", (double) video_snapshot.frames_sent);
    cJSON_AddNumberToObject(doc, "video_send_errors", (double) video_snapshot.send_errors);
    cJSON_AddNumberToObject(doc, "video_backlog_resets", (double) video_snapshot.backlog_resets);
    cJSON_AddStringToObject(doc, "video_last_error", video_snapshot.last_error);

    /* Control bridge counters. */
    cJSON_AddBoolToObject(doc, "control_registered", control_snapshot.registered != 0);
    cJSON_AddNumberToObject(doc, "control_reconnect_count", (double) control_snapshot.reconnect_count);
    cJSON_AddNumberToObject(doc, "control_unix_send_errors", (double) control_snapshot.unix_send_errors);
    cJSON_AddStringToObject(doc, "control_last_error", control_snapshot.last_error);

    write_rc = write_status_json_atomic(state->status_file_path, doc);
    cJSON_Delete(doc);
    return write_rc;
}
/*
 * Reports whether a heartbeat is older than timeout_sec at time now_sec.
 *
 * Returns 1 when (now_sec - heartbeat) is strictly greater than timeout_sec.
 * Returns 0 when heartbeat is NULL, timeout_sec is not positive, the stored
 * heartbeat is not positive, or the heartbeat is still within the timeout.
 */
static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) {
    long last_beat_sec;
    long silent_for_sec;

    if (heartbeat == NULL || timeout_sec <= 0) {
        return 0;
    }

    last_beat_sec = atomic_load(heartbeat);
    if (last_beat_sec <= 0) {
        return 0;
    }

    silent_for_sec = now_sec - last_beat_sec;
    return silent_for_sec > timeout_sec;
}
/*
 * Terminates the process when a worker heartbeat is older than the configured
 * timeout: exit code EXIT_CODE_VIDEO_THREAD_STALLED for the video thread,
 * EXIT_CODE_CONTROL_THREAD_STALLED for the control thread. The video thread
 * is checked first. Returns without doing anything when state is NULL or the
 * timeout is not positive.
 */
static void exit_if_thread_stalled(daemon_state_t *state) {
    int timeout_sec;
    long now_sec;

    if (state == NULL) {
        return;
    }
    timeout_sec = state->heartbeat_timeout_sec;
    if (timeout_sec <= 0) {
        return;
    }

    /* One timestamp for both checks so they are judged against the same instant. */
    now_sec = realtime_epoch_sec();

    if (thread_heartbeat_expired(&state->video_thread_heartbeat_epoch_sec, timeout_sec, now_sec)) {
        fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_VIDEO_THREAD_STALLED);
    }

    if (thread_heartbeat_expired(&state->control_thread_heartbeat_epoch_sec, timeout_sec, now_sec)) {
        fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", timeout_sec);
        fflush(stderr);
        exit(EXIT_CODE_CONTROL_THREAD_STALLED);
    }
}
static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) {
if (stats == NULL) {
@@ -295,7 +518,10 @@ static void *video_thread_main(void *arg) {
daemon_state_t *state = (daemon_state_t *) arg;
while (!*state->stop_requested) {
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
maybe_inject_thread_stall(state, state->video_thread_fault_file, "video");
int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested);
update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec);
if (video_rc == 0) {
break;
@@ -318,6 +544,8 @@ static void *control_thread_main(void *arg) {
kcp_client_t *client = NULL;
int reconnect_immediately = 0;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
maybe_inject_thread_stall(state, state->control_thread_fault_file, "control");
kcp_conn_options_set_control_defaults(&options);
client = kcp_client_dial_with_options(
state->control_server_addr,
@@ -361,8 +589,10 @@ static void *control_thread_main(void *arg) {
int rc;
kcp_client_state_t client_state;
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
protocol_message_init(&msg);
rc = kcp_client_receive_timed(client, &msg, 100);
update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec);
if (rc == 1) {
char reconnect_reason[256];
@@ -550,6 +780,7 @@ int main(void) {
daemon_state_t state;
pthread_t video_thread;
pthread_t control_thread;
long initial_heartbeat;
memset(&state, 0, sizeof(state));
state.stop_requested = &g_stop_requested;
@@ -563,10 +794,35 @@ int main(void) {
state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID);
state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER);
state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET);
state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR);
state.heartbeat_timeout_sec = env_int_or_default(
"BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC",
DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC
);
state.video_config.progress_callback = video_pipeline_heartbeat_progress;
state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec;
state.control_server_idle_reconnect_ms = env_int_or_default(
"OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS",
CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS
);
snprintf(state.status_file_path, sizeof(state.status_file_path), "%s/%s", state.runtime_dir, DEFAULT_STATUS_FILE_NAME);
snprintf(
state.video_thread_fault_file,
sizeof(state.video_thread_fault_file),
"%s/%s",
state.runtime_dir,
DEFAULT_VIDEO_THREAD_FAULT_FILE
);
snprintf(
state.control_thread_fault_file,
sizeof(state.control_thread_fault_file),
"%s/%s",
state.runtime_dir,
DEFAULT_CONTROL_THREAD_FAULT_FILE
);
initial_heartbeat = realtime_epoch_sec();
atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat);
atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat);
if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' ||
state.control_server_addr == NULL || state.control_server_addr[0] == '\0') {
@@ -624,6 +880,10 @@ int main(void) {
while (!g_stop_requested) {
sleep(1);
print_stats(&state);
if (write_daemon_status_file(&state) != 0) {
fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno));
}
exit_if_thread_stalled(&state);
}
pthread_join(video_thread, NULL);

View File

@@ -18,6 +18,8 @@ typedef struct video_pipeline_packet_metadata {
double longitude;
} video_pipeline_packet_metadata_t;
/*
 * Signature of a progress hook: receives one opaque context pointer and
 * returns nothing.
 * NOTE(review): when and how often the pipeline invokes the hook is not
 * visible here; confirm in the pipeline implementation.
 */
typedef void (*video_pipeline_progress_fn)(void *context);
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(video_pipeline_packet_metadata_t) == 24, "video trailer metadata must be 24 bytes");
#endif
@@ -39,6 +41,8 @@ typedef struct video_pipeline_config {
int soft_backpressure_segments;
int hard_backpressure_segments;
int hard_backpressure_hold_ms;
video_pipeline_progress_fn progress_callback;
void *progress_context;
} video_pipeline_config_t;
typedef struct video_pipeline_stats {

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import json
import os
import socket
import threading
@@ -90,8 +91,14 @@ class UdpCmdVelReceiver(Node):
self._last_published_command: CommandTuple = ZERO_COMMAND
self._closing = threading.Event()
self._recv_buffer = bytearray(DEFAULT_RECV_BUFFER_BYTES)
self._runtime_dir = os.getenv('BLITZ_RUNTIME_DIR', '/run/blitz-robot').strip() or '/run/blitz-robot'
self._status_path = os.path.join(self._runtime_dir, 'ros-receiver.status.json')
self._transport_reconnect_count = 0
self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms()
self._runtime_last_error = ''
self.create_timer(1.0 / self._publish_rate_hz, self._publish_tick)
self.create_timer(1.0, self._write_status_tick)
recv_target = self._recv_loop_unix_dgram if self._transport_name == 'unix_dgram' else self._recv_loop
self._recv_thread = threading.Thread(target=recv_target, daemon=True)
@@ -174,6 +181,8 @@ class UdpCmdVelReceiver(Node):
pass
try:
self._transport = self._create_transport()
self._transport_reconnect_count += 1
self._set_runtime_last_error('')
if self._should_log('transport_reconnected', 1.0):
self.get_logger().info(
'Reconnected OmniSocket transport %s://%s as %s'
@@ -182,6 +191,7 @@ class UdpCmdVelReceiver(Node):
return True
except OSError as exc:
self._transport = None
self._set_runtime_last_error(str(exc))
if self._should_log('transport_reconnect_error', 2.0):
self.get_logger().error(f'Failed to reconnect OmniSocket transport: {exc}')
time.sleep(0.5)
@@ -192,10 +202,13 @@ class UdpCmdVelReceiver(Node):
self._close_unix_socket()
try:
self._setup_unix_socket()
self._transport_reconnect_count += 1
self._set_runtime_last_error('')
if self._should_log('unix_rebound', 1.0):
self.get_logger().info(f'Rebound unix datagram socket at {self._local_socket_path}')
return True
except OSError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('unix_rebind_error', 2.0):
self.get_logger().error(f'Failed to rebind unix datagram socket: {exc}')
time.sleep(0.5)
@@ -209,6 +222,61 @@ class UdpCmdVelReceiver(Node):
return True
return False
def _now_epoch_ms(self) -> int:
return time.time_ns() // 1_000_000
def _update_recv_heartbeat(self) -> None:
with self._lock:
self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms()
def _last_packet_age_ms(self) -> int | None:
with self._lock:
last_packet_monotonic = self._last_packet_monotonic
if last_packet_monotonic is None:
return None
return max(0, int((time.monotonic() - last_packet_monotonic) * 1000.0))
def _socket_bound(self) -> bool:
if self._transport_name == 'unix_dgram':
return self._unix_socket is not None and os.path.exists(self._local_socket_path)
return self._transport is not None
def _set_runtime_last_error(self, message: str) -> None:
self._runtime_last_error = message
def _status_payload(self) -> dict[str, object]:
with self._lock:
recv_thread_heartbeat_epoch_ms = self._recv_thread_heartbeat_epoch_ms
return {
'updated_at_epoch_ms': self._now_epoch_ms(),
'pid': os.getpid(),
'recv_thread_heartbeat_epoch_ms': recv_thread_heartbeat_epoch_ms,
'transport': self._transport_name,
'local_socket_path': self._local_socket_path,
'socket_bound': self._socket_bound(),
'transport_reconnect_count': self._transport_reconnect_count,
'last_packet_age_ms': self._last_packet_age_ms(),
'last_error': self._runtime_last_error,
}
def _write_status_tick(self) -> None:
payload = self._status_payload()
if self._transport_name == 'unix_dgram':
if self._unix_socket is None:
payload['last_error'] = self._runtime_last_error or 'unix datagram socket is not bound'
else:
if self._transport is None:
payload['last_error'] = self._runtime_last_error or 'OmniSocket transport is not connected'
try:
os.makedirs(self._runtime_dir, exist_ok=True)
temp_path = f'{self._status_path}.tmp.{os.getpid()}'
with open(temp_path, 'w', encoding='utf-8') as handle:
json.dump(payload, handle, ensure_ascii=True, separators=(',', ':'))
os.replace(temp_path, self._status_path)
except OSError as exc:
if self._should_log('status_write_error', 5.0):
self.get_logger().warning(f'Failed to write receiver status file: {exc}')
def _publish_command(self, command: CommandTuple) -> None:
msg = TwistStamped()
msg.header.stamp = self.get_clock().now().to_msg()
@@ -229,32 +297,39 @@ class UdpCmdVelReceiver(Node):
def _recv_loop(self) -> None:
while not self._closing.is_set() and rclpy.ok():
self._update_recv_heartbeat()
try:
assert self._transport is not None
meta = self._transport.recv_into(buffer=self._recv_buffer, timeout_ms=100)
except BufferError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('buffer_error', 2.0):
self.get_logger().warning(f'Dropped oversized OmniSocket frame: {exc}')
continue
except OSError as exc:
self._set_runtime_last_error(str(exc))
if not self._closing.is_set() and self._should_log('recv_error', 2.0):
self.get_logger().error(f'OmniSocket receive loop stopped: {exc}')
if not self._reconnect_transport():
return
continue
self._update_recv_heartbeat()
if meta is None:
continue
self._set_runtime_last_error('')
from_peer = str(meta['from'])
msg_type = int(meta['msg_type'])
body_len = int(meta['body_len'])
if msg_type == self._msg_type_error:
self._set_runtime_last_error(f'server error message from {from_peer}')
self._handle_error_message(from_peer, body_len)
continue
if self._expected_sender and from_peer != self._expected_sender:
self._set_runtime_last_error(f'unexpected sender {from_peer}')
if self._should_log('unexpected_sender', 2.0):
self.get_logger().warning(
'Ignoring message from unexpected sender %s (expected %s)'
@@ -263,6 +338,7 @@ class UdpCmdVelReceiver(Node):
continue
if msg_type != self._msg_type_binary:
self._set_runtime_last_error(f'unexpected message type {msg_type}')
if self._should_log('unexpected_type', 2.0):
self.get_logger().warning(
'Ignoring unexpected message type %d from %s (%d bytes)'
@@ -271,6 +347,7 @@ class UdpCmdVelReceiver(Node):
continue
if body_len != PACKET_SIZE:
self._set_runtime_last_error(f'invalid payload size {body_len}')
if self._should_log('packet_size', 2.0):
self.get_logger().warning(
'Dropped binary payload from %s with invalid size %d (expected %d)'
@@ -281,6 +358,7 @@ class UdpCmdVelReceiver(Node):
try:
command = unpack_command(self._recv_buffer[:PACKET_SIZE])
except ValueError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('decode_error', 2.0):
self.get_logger().warning(f'Dropped malformed command payload: {exc}')
continue
@@ -288,15 +366,18 @@ class UdpCmdVelReceiver(Node):
with self._lock:
self._latest_command = command
self._last_packet_monotonic = time.monotonic()
self._set_runtime_last_error('')
def _recv_loop_unix_dgram(self) -> None:
assert self._unix_socket is not None
while not self._closing.is_set() and rclpy.ok():
self._update_recv_heartbeat()
try:
payload = self._unix_socket.recv(DEFAULT_RECV_BUFFER_BYTES)
except socket.timeout:
if not os.path.exists(self._local_socket_path):
self._set_runtime_last_error('unix datagram socket path disappeared')
if self._should_log('unix_socket_missing', 2.0):
self.get_logger().warning(
f'Unix datagram socket path disappeared, rebinding {self._local_socket_path}'
@@ -305,13 +386,16 @@ class UdpCmdVelReceiver(Node):
return
continue
except OSError as exc:
self._set_runtime_last_error(str(exc))
if not self._closing.is_set() and self._should_log('unix_recv_error', 2.0):
self.get_logger().error(f'Unix datagram receive loop stopped: {exc}')
if not self._rebind_unix_socket():
return
continue
self._update_recv_heartbeat()
if len(payload) != PACKET_SIZE:
self._set_runtime_last_error(f'invalid unix datagram payload size {len(payload)}')
if self._should_log('unix_packet_size', 2.0):
self.get_logger().warning(
'Dropped unix datagram payload with invalid size %d (expected %d)'
@@ -322,6 +406,7 @@ class UdpCmdVelReceiver(Node):
try:
command = unpack_command(payload)
except ValueError as exc:
self._set_runtime_last_error(str(exc))
if self._should_log('unix_decode_error', 2.0):
self.get_logger().warning(f'Dropped malformed unix datagram payload: {exc}')
continue
@@ -329,6 +414,7 @@ class UdpCmdVelReceiver(Node):
with self._lock:
self._latest_command = command
self._last_packet_monotonic = time.monotonic()
self._set_runtime_last_error('')
def _command_for_publish_tick(self) -> tuple[CommandTuple, Optional[float], bool]:
with self._lock:

View File

@@ -1,385 +1,210 @@
# 机器人 B 端开机自启说明
# Robot B-Side Boot Chain
这个目录是给机器人端做开机自启用的。
This directory contains the robot-side boot and recovery scripts.
你看到这里多了不少脚本和 `systemd` 单元,不是为了让你手工一条条执行,而是为了把开机流程拆开管理:
1. 固定启动顺序
2. 某一步失败时可单独重试
3. 所有动作统一写到一个本地日志文件
4. 后面如果要把“固定延时 30 秒”换成“等待机器人原有自检完成”,只改最前面的闸门即可
所以平时真正需要人工执行的,通常只有这两步:
Normal usage is:
```bash
sudo bash scripts/boot/install-systemd.sh
sudo systemctl start blitz-robot.target
```
以后机器人重启时,就不需要你再手工执行这些脚本了。
After installation, `blitz-robot.target` is enabled and will start automatically on reboot.
## 启动顺序
To stop the chain now and disable boot-time autostart for future reboots:
当前开机链路如下:
```bash
sudo bash scripts/boot/disable-systemd.sh
```
## Current Startup Order
The current cold-start chain is:
1. `blitz-boot-gate.service`
2. `blitz-5g-dial.service`
3. `blitz-time-sync.service`
4. `blitz-ros-receiver.service`
5. `blitz-b-side-omnid.service`
3. `blitz-ros-receiver.service`
4. `blitz-b-side-omnid.service`
5. `blitz-watchdog.service`
对应业务顺序就是:
There is no longer any automatic time-sync step in the boot chain.
1. 先固定等待 30 秒,给机器人原有自检/自启程序让路
2. 运行 5G 自动拨号
3. 运行时钟同步
4. 启动 `start-ros-receiver.sh`
5. 启动 `start-b-side-omnid.sh`
## What Each Script Does
## 日志文件
- `robot-boot.env`: default boot configuration
- `robot-boot.env.local`: machine-local overrides
- `common.sh`: shared env loading, logging, and helper functions
- `boot-gate.sh`: fixed startup delay gate
- `5g-dial.sh`: brings up the 5G modem path and verifies routing
- `start-ros-receiver-service.sh`: boot wrapper for ROS receiver
- `wait-for-unix-socket.sh`: waits for the ROS receiver unix socket
- `start-b-side-omnid-service.sh`: boot wrapper for `b_side_omnid`
- `blitz-watchdog.sh`: runtime health watchdog and recovery orchestrator
- `blitz-fault-inject.sh`: fault injection entrypoint
- `install-systemd.sh`: installs systemd units into `/etc/systemd/system`
- `disable-systemd.sh`: stops the boot chain and disables autostart
所有关键操作都会统一写到这个本地文件:
## Important Configuration
```text
/var/log/blitz-robot/startup.log
```
每一行日志格式如下:
```text
timestamp | step | action | result | details | exit_code
```
日志里会记录:
- 做了什么
- 实际执行了什么命令
- 前置检查是否通过
- 成功还是失败
- 失败原因
- 退出码
- 是否发生了重试
## 这些文件分别是干什么的
- `robot-boot.env`:开机自启默认配置
- `robot-boot.env.local`:本机覆盖配置,建议把你自己的配置写这里
- `common.sh`:公共环境加载和统一日志函数
- `boot-gate.sh`:启动闸门,当前逻辑是固定等待 30 秒
- `5g-dial.sh`:等待 5G 串口出现,执行 `rndis_dial.py`,删除 5G 默认路由并补齐目标主机路由,然后检查路由是否真的起来
- `time-sync.sh`:把 `chrony` 指向白名单服务器 IP 和端口,并执行一次同步
- `start-ros-receiver-service.sh`:开机版 ROS receiver 启动包装
- `wait-for-unix-socket.sh`:等待 ROS receiver 建好本地 unix socket
- `start-b-side-omnid-service.sh`:开机版 `b_side_omnid` 启动包装
- `install-systemd.sh`:把 `systemd` 单元安装到 `/etc/systemd/system`
- `systemd/*.service.in`、`systemd/*.target.in`:`systemd` 模板文件
## 前置条件
你前面说过,除了时钟同步以外,其他程序环境都应该已经配好了。按这个前提,这里只强调必须确认的前置条件。
### 1. 机器人侧必须已有的条件
默认认为下面这些已经具备:
- 系统是 Ubuntu,且使用 `systemd`
- `OmniSocketGo` 仓库已经放在机器人上
- `scripts/dev/start-ros-receiver.sh` 原本就能正常启动
- `scripts/dev/start-b-side-omnid.sh` 原本就能正常启动
- `bin/b_side_omnid` 已经提前编译好
- 5G 拨号脚本存在:`/home/nvidia/5g-test/5G/rndis_dial.py`
- 5G 串口设备是:`/dev/ttyUSB7`
注意:
- 开机模式下不会自动编译 `b_side_omnid`
- 如果 `bin/b_side_omnid` 不存在,服务会直接报错并写日志
### 2. 时钟同步需要的前置安装
时钟同步这一步依赖 `chrony`
如果机器人侧没有安装,请先安装:
```bash
sudo apt update
sudo apt install -y chrony
```
安装后建议确认:
```bash
systemctl status chrony
chronyc tracking
```
### 3. 云服务器侧需要的前置条件
因为你的 5G 是白名单网络,所以时钟同步不能依赖公网域名或默认 NTP 池,必须只用你的白名单云服务器 IP。
云服务器侧需要满足:
- 服务器上运行 `chronyd`
- 安全组 / 防火墙放通你实际使用的 UDP 端口
- 机器人能访问这台服务器的 IP
如果云服务器还没有安装 `chrony`,可以参考:
```bash
sudo apt update
sudo apt install -y chrony
sudo systemctl enable chrony
sudo systemctl restart chrony
```
如果你不能使用标准的 `123/udp`,完全可以改成你自己的端口,例如 `10910/udp`
例如云服务器 /etc/chrony/chrony.conf 里改成监听 10910
```conf
port 10910
allow 0/0
```
然后重启:
```bash
sudo systemctl restart chrony
```
机器人端则在 `robot-boot.env.local` 里配置:
```bash
BLITZ_TIME_SERVER_IP="你的云服务器IP"
BLITZ_TIME_SERVER_PORT="10910"
```
这样 `time-sync.sh` 会自动生成:
```conf
server 你的云服务器IP port 10910 iburst
```
注意:这里必须是你自己可控的 `chronyd` 服务端。公网标准 NTP 服务通常只监听 `123/udp`,不能要求它们改到 `10910`
## 需要改哪些配置
不要直接改 `robot-boot.env`,更推荐新建:
Most machine-specific overrides should go into:
```text
scripts/boot/robot-boot.env.local
```
常见要改的是这些:
Typical settings:
```bash
BLITZ_BOOT_DELAY_SEC="30"
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
BLITZ_RUNTIME_DIR="/run/blitz-robot"
BLITZ_5G_DIAL_DIR="/home/nvidia/5g-test/5G"
BLITZ_5G_SERIAL_PORT="/dev/ttyUSB7"
BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot"
BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2"
BLITZ_5G_INTERFACE=""
BLITZ_5G_MODEM_SUBNET="192.168.224.0/22"
BLITZ_5G_GATEWAY="192.168.225.1"
BLITZ_5G_REMOVE_DEFAULT_ROUTE="1"
BLITZ_5G_ROUTE_TARGETS="106.55.173.235"
BLITZ_5G_INFO_JSON="${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json"
BLITZ_TIME_SERVER_IP="你的白名单云服务器IP"
BLITZ_TIME_SERVER_PORT="10910"
BLITZ_TIME_SERVER_IP="81.70.156.140"
BLITZ_ROS_USER="nvidia"
BLITZ_ROS_SOCKET_WAIT_SEC="20"
BLITZ_WATCHDOG_INTERVAL_SEC="5"
BLITZ_HEALTH_STALE_SEC="15"
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
BLITZ_NETWORK_FAIL_THRESHOLD="3"
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
```
如果 `BLITZ_TIME_SERVER_IP` 留空,脚本会自动回退到 `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR` 的 IP 部分。
`BLITZ_TIME_SERVER_IP` is still used, but only as the 5G route/ping health-check target. It is no longer used for automatic clock synchronization.
`BLITZ_5G_REMOVE_DEFAULT_ROUTE="1"` 时,脚本会在 5G 拨号完成后删除该接口上的默认路由,避免整机默认出口切到 5G。此时 `BLITZ_TIME_SERVER_IP` 和 `BLITZ_5G_ROUTE_TARGETS` 中的目标 IP 会显式走 5G,其它流量继续走有线或 Wi-Fi 的默认路由。
If `BLITZ_TIME_SERVER_IP` is left empty, the scripts fall back to the host part of `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR`.
## 如何安装和使用
## Install Or Upgrade
下面假设你当前目录就在 `OmniSocketGo` 仓库根目录。
### 第一步:准备本机配置
建议先创建:
```bash
cp scripts/boot/robot-boot.env scripts/boot/robot-boot.env.local
```
然后编辑:
```bash
vim scripts/boot/robot-boot.env.local
```
至少确认这几个值是对的:
- `BLITZ_5G_DIAL_DIR`
- `BLITZ_5G_SERIAL_PORT`
- `BLITZ_TIME_SERVER_IP`
- `BLITZ_TIME_SERVER_PORT`
- `BLITZ_ROS_USER`
### 第二步:安装 systemd 单元
执行:
Run:
```bash
sudo bash scripts/boot/install-systemd.sh
sudo systemctl daemon-reload
sudo systemctl restart blitz-robot.target
```
这个安装脚本会做这些事情:
`install-systemd.sh` will also remove any old `blitz-time-sync.service` unit left over from earlier versions.
1. 创建日志目录和日志文件
2. 渲染 `systemd` 模板
3. 把 unit 文件复制到 `/etc/systemd/system`
4. 执行 `systemctl daemon-reload`
5. 执行 `systemctl enable blitz-robot.target`
## Disable Autostart
### 第三步:立刻启动一次
执行:
To stop the currently running services and disable autostart for future reboots:
```bash
sudo bash scripts/boot/disable-systemd.sh
```
To re-enable later:
```bash
sudo bash scripts/boot/install-systemd.sh
sudo systemctl start blitz-robot.target
```
### 第四步:以后重启自动生效
## Logs
因为安装脚本已经做了 `enable`,所以后续机器人重启时会自动拉起,不需要你再手工执行。
如果想手工确认,也可以执行:
```bash
sudo systemctl enable blitz-robot.target
```
## 如何查看是否正常
### 看总日志文件
最直接:
```bash
tail -f /var/log/blitz-robot/startup.log
```
### 看各个服务状态
```bash
systemctl status blitz-robot.target
systemctl status blitz-boot-gate.service
systemctl status blitz-5g-dial.service
systemctl status blitz-time-sync.service
systemctl status blitz-ros-receiver.service
systemctl status blitz-b-side-omnid.service
```
### 看 journal
```bash
journalctl -u blitz-robot.target -u blitz-boot-gate.service -u blitz-5g-dial.service \
-u blitz-time-sync.service -u blitz-ros-receiver.service \
-u blitz-b-side-omnid.service -f
```
## 当前时钟同步会做什么
`time-sync.sh` 当前逻辑是:
1. 读取 `BLITZ_TIME_SERVER_IP`
2. 读取 `BLITZ_TIME_SERVER_PORT`
3. 修改 `/etc/chrony/chrony.conf`
4. 注释掉原有的 `pool``server`
5. 保留一个备份文件:`/etc/chrony/chrony.conf.blitz-bak`
6. 写入:
All boot-chain and watchdog logs are appended to:
```text
/etc/chrony/sources.d/blitz-robot.sources
/var/log/blitz-robot/startup.log
```
7. 生成类似下面这一行:
```conf
server 你的云服务器IP port 10910 iburst
```
8. 重启 `chrony`
9. 执行 `chronyc burst`
10. 执行 `chronyc waitsync`
注意:
- 如果同步超时,会记日志为 `soft_fail`
- 但不会阻塞后面的 ROS 和 `b_side_omnid` 启动
## 常见问题
### 1. 为什么会突然多出这么多脚本?
因为把开机流程拆成了多个稳定的小步骤:
- 更容易排查哪一步失败
- 更容易让 `systemd` 自动重启
- 更容易记录完整日志
- 后续更容易替换“30 秒延时”为真正的机器人 ready 条件
你平时不需要手工逐个执行这些脚本。
### 2. 我是不是要手工跑 `5g-dial.sh`、`time-sync.sh`、`start-ros-receiver-service.sh`
正常情况下不用。
你只需要:
Follow the log live:
```bash
sudo bash scripts/boot/install-systemd.sh
sudo systemctl start blitz-robot.target
sudo tail -f /var/log/blitz-robot/startup.log
```
### 3. 如果时钟同步失败怎么办?
先看:
Check service state:
```bash
tail -f /var/log/blitz-robot/startup.log
systemctl status blitz-time-sync.service
chronyc sources -v
chronyc tracking
sudo systemctl status blitz-robot.target
sudo systemctl status blitz-5g-dial.service
sudo systemctl status blitz-ros-receiver.service
sudo systemctl status blitz-b-side-omnid.service
sudo systemctl status blitz-watchdog.service
```
优先检查:
- `BLITZ_TIME_SERVER_IP` 是否填对
- `BLITZ_TIME_SERVER_PORT` 是否填对
- 云服务器是否真的跑了 `chronyd`
- 云服务器防火墙 / 安全组是否放通你配置的 UDP 端口,例如 `10910`
- 5G 白名单是否确实允许访问这个服务器 IP
### 4. 如果 ROS receiver 没起来怎么办?
先看:
Check systemd journal:
```bash
systemctl status blitz-ros-receiver.service
tail -f /var/log/blitz-robot/startup.log
sudo journalctl -u blitz-robot.target -u blitz-5g-dial.service \
-u blitz-ros-receiver.service -u blitz-b-side-omnid.service \
-u blitz-watchdog.service -f
```
再检查:
## Runtime Status Files
- `/opt/ros/${ROS_DISTRO}/setup.bash` 是否存在
- `${ROS_CONTROL_PY_DIR}/install/setup.bash` 是否存在
- `ROBOT_RECEIVER_LOCAL_SOCKET_PATH` 对应的 socket 是否出现
The runtime status directory is:
### 5. 如果 b_side_omnid 没起来怎么办?
```text
/run/blitz-robot
```
先看:
Key files:
- `b-side-omnid.status.json`
- `ros-receiver.status.json`
- `watchdog.status.json`
Pretty-print them:
```bash
systemctl status blitz-b-side-omnid.service
tail -f /var/log/blitz-robot/startup.log
sudo python3 -m json.tool /run/blitz-robot/watchdog.status.json
sudo python3 -m json.tool /run/blitz-robot/b-side-omnid.status.json
sudo python3 -m json.tool /run/blitz-robot/ros-receiver.status.json
```
再检查:
## Fault Injection
- `bin/b_side_omnid` 是否已经提前编译好
- 摄像头设备是否存在
- `robot-remote.env` / `robot-boot.env.local` 里的地址配置是否正确
Available test commands:
```bash
sudo bash scripts/boot/blitz-fault-inject.sh bside-crash
sudo bash scripts/boot/blitz-fault-inject.sh bside-process-freeze
sudo bash scripts/boot/blitz-fault-inject.sh bside-video-thread-stall
sudo bash scripts/boot/blitz-fault-inject.sh bside-control-thread-stall
sudo bash scripts/boot/blitz-fault-inject.sh ros-crash
sudo bash scripts/boot/blitz-fault-inject.sh ros-freeze
```
For synthetic network fault injection, first enable it in `robot-boot.env.local`:
```bash
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="1"
```
Then restart watchdog and inject:
```bash
sudo systemctl restart blitz-watchdog.service
sudo bash scripts/boot/blitz-fault-inject.sh network-down on
sudo bash scripts/boot/blitz-fault-inject.sh network-down off
```
## Recovery Behavior Summary
- If `b_side_omnid` dies or its status file goes stale, watchdog first tries a targeted `b_side` restart.
- If ROS receiver dies, loses its socket, or its heartbeat goes stale, watchdog performs an ordered full restart:
- stop `b_side`
- restart ROS receiver
- wait for unix socket
- start `b_side`
- If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services.
- Camera disappearance is logged as degraded state. Reappearance triggers a `b_side` restart after the device is stable.
## Notes
- `time-sync.sh` and `blitz-time-sync.service` are intentionally removed from the automatic boot path.
- `b_side_omnid` must already be built before boot-time startup.
- A missing `bin/b_side_omnid` binary, missing ROS environment, or missing modem script is reported in `startup.log`.

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
STEP="fault-inject"
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"
# Print the MainPID property that systemd reports for a unit.
#   $1  systemd unit name
main_pid_for_service() {
    local unit="$1"
    systemctl show --property=MainPID --value "${unit}"
}
# Print the main PID of a service on stdout.
# When systemd reports no PID (empty string or "0"), log a failure and exit 1.
#   $1  systemd unit name
require_running_pid() {
    local unit="$1"
    local main_pid
    main_pid="$(main_pid_for_service "${unit}")"
    case "${main_pid}" in
        ""|0)
            blitz_log "${STEP}" "lookup-pid" "failure" "service=${unit}" 1
            exit 1
            ;;
    esac
    printf '%s\n' "${main_pid}"
}
# Create (or overwrite) a fault-injection flag file inside BLITZ_RUNTIME_DIR.
# The file content is the current epoch time in seconds.
#   $1  flag file name (relative to the runtime directory)
write_fault_flag() {
    local target="${BLITZ_RUNTIME_DIR}/$1"
    printf '%s\n' "$(date +%s)" > "${target}"
    blitz_log "${STEP}" "flag-on" "success" "path=${target}" 0
}
# Remove a fault-injection flag file from BLITZ_RUNTIME_DIR.
# A flag that does not exist is not an error (rm -f).
#   $1  flag file name (relative to the runtime directory)
clear_fault_flag() {
    local target="${BLITZ_RUNTIME_DIR}/$1"
    rm -f "${target}"
    blitz_log "${STEP}" "flag-off" "success" "path=${target}" 0
}
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_prepare_runtime_dir

# Send a signal to the main process of a service.
#   $1  signal name or number without the leading dash (e.g. 9, STOP)
#   $2  systemd unit name
# The PID is resolved in its own assignment: require_running_pid runs in a
# command substitution, so its `exit 1` only leaves that subshell. With the
# substitution used directly as a `kill` argument, kill would still run with
# an empty argument; as an assignment, `set -e` aborts the script first.
signal_service() {
    local signal_name="$1"
    local service_name="$2"
    local target_pid
    target_pid="$(require_running_pid "${service_name}")"
    kill "-${signal_name}" "${target_pid}"
}

# Dispatch on the requested fault scenario.
case "${1:-}" in
    bside-crash)
        signal_service 9 "${B_SIDE_SERVICE}"
        ;;
    bside-process-freeze)
        signal_service STOP "${B_SIDE_SERVICE}"
        ;;
    bside-video-thread-stall)
        write_fault_flag "fault-injection-bside-video-thread-stall"
        ;;
    bside-control-thread-stall)
        write_fault_flag "fault-injection-bside-control-thread-stall"
        ;;
    ros-crash)
        signal_service 9 "${ROS_SERVICE}"
        ;;
    ros-freeze)
        signal_service STOP "${ROS_SERVICE}"
        ;;
    network-down)
        # The synthetic network fault must be explicitly enabled in the boot env.
        if [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" != "1" ]]; then
            blitz_log "${STEP}" "network-down" "failure" "set BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION=1 first" 1
            exit 1
        fi
        case "${2:-}" in
            on)
                write_fault_flag "fault-injection-network-down"
                ;;
            off)
                clear_fault_flag "fault-injection-network-down"
                ;;
            *)
                echo "usage: $0 network-down on|off" >&2
                exit 2
                ;;
        esac
        ;;
    *)
        cat <<'EOF'
usage:
  blitz-fault-inject.sh bside-crash
  blitz-fault-inject.sh bside-process-freeze
  blitz-fault-inject.sh bside-video-thread-stall
  blitz-fault-inject.sh bside-control-thread-stall
  blitz-fault-inject.sh ros-crash
  blitz-fault-inject.sh ros-freeze
  blitz-fault-inject.sh network-down on|off
EOF
        exit 2
        ;;
esac

View File

@@ -0,0 +1,388 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
# Step label passed to blitz_log for every watchdog log line.
STEP="watchdog"
# systemd units supervised by this watchdog.
B_SIDE_SERVICE="blitz-b-side-omnid.service"
ROS_SERVICE="blitz-ros-receiver.service"
# File paths; assigned below once the boot environment is loaded and BLITZ_RUNTIME_DIR is known.
B_SIDE_STATUS_FILE=""
ROS_STATUS_FILE=""
WATCHDOG_STATUS_FILE=""
NETWORK_FAULT_FILE=""
# Camera tracking: CAMERA_MISSING_PREV is 1 from the moment the device is seen missing until the
# post-reappearance restart is issued; the stable count is the number of loop passes since it reappeared.
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
# Consecutive failed network checks, and the epoch second until which network checks are skipped
# after a successful network recovery.
NETWORK_FAIL_COUNT=0
NETWORK_COOLDOWN_UNTIL=0
# Epoch second until which the main loop skips all checks (set by record_full_restart).
BACKOFF_UNTIL=0
# Most recent recovery action and its timestamp; both are reported in the watchdog status file.
LAST_ACTION="none"
LAST_ACTION_EPOCH_MS=0
# Full-restart accounting window: start (epoch seconds) and number of full restarts inside it.
FULL_RESTART_WINDOW_START=0
FULL_RESTART_WINDOW_COUNT=0
# Interface name resolved by the most recent network health check (empty when unresolved).
NETWORK_LAST_INTERFACE=""
# Per-fault-key accounting windows for targeted b_side restarts: start (epoch seconds) and count.
declare -A TARGETED_RESTART_WINDOW_START=()
declare -A TARGETED_RESTART_WINDOW_COUNT=()
# Print the current wall-clock time as whole seconds since the Unix epoch.
now_epoch_sec() {
    date '+%s'
}
# Print the current wall-clock time as milliseconds since the Unix epoch.
# Uses the %3N format (nanoseconds truncated to three digits).
now_epoch_ms() {
    date '+%s%3N'
}
# Succeed when systemd reports the given unit as active.
#   $1  systemd unit name
service_is_active() {
    local unit="$1"
    systemctl is-active --quiet "${unit}"
}
# Succeed when a regular file exists and was modified no more than the given
# number of seconds ago.
#   $1  file path
#   $2  maximum allowed age in seconds
# If stat fails, the mtime is treated as 0, which makes the file look stale.
status_file_fresh() {
    local file="$1"
    local limit_sec="$2"
    local current_sec
    local modified_sec
    [[ -f "${file}" ]] || return 1
    current_sec="$(now_epoch_sec)"
    modified_sec="$(stat -c %Y "${file}" 2>/dev/null || echo 0)"
    (( current_sec - modified_sec <= limit_sec ))
}
# Inspect the ROS receiver status JSON and succeed only when it reports a bound
# socket and a receive-thread heartbeat no older than the given age.
#   $1  path of the status JSON file
#   $2  maximum heartbeat age in seconds
# Any failure to read or parse the file counts as "not fresh".
ros_receiver_status_fresh() {
    local status_path="$1"
    local stale_after_sec="$2"
    local current_ms
    current_ms="$(now_epoch_ms)"
    python3 - "${status_path}" "${current_ms}" "${stale_after_sec}" <<'PY'
import json
import sys

status_path = sys.argv[1]
now_ms = int(sys.argv[2])
limit_ms = int(sys.argv[3]) * 1000

try:
    with open(status_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
except Exception:
    sys.exit(1)

heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0)
if heartbeat_ms <= 0 or not payload.get("socket_bound"):
    sys.exit(1)

sys.exit(0 if now_ms - heartbeat_ms <= limit_ms else 1)
PY
}
# Succeed only when every ROS receiver health condition holds:
# the service is active, its unix socket exists, the status file was modified
# recently, and the heartbeat recorded inside the status file is recent.
#   $1  maximum allowed staleness in seconds
ros_receiver_healthy() {
    local stale_after_sec="$1"
    # A bare `return` propagates the failing check's own exit status.
    service_is_active "${ROS_SERVICE}" || return
    [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] || return
    status_file_fresh "${ROS_STATUS_FILE}" "${stale_after_sec}" || return
    ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${stale_after_sec}"
}
# Rewrite the watchdog status JSON file (WATCHDOG_STATUS_FILE).
# Arguments:
#   $1 fault reason (string)      $2 recovery state (string)
#   $3 network_ok  $4 camera_ok   $5 ros_ok  $6 bside_ok
# The *_ok values are emitted unquoted; the main loop passes 0 or 1.
# The string values are interpolated as-is, without JSON escaping.
# Counters and the last action are taken from the script's global state.
write_watchdog_status() {
local fault_reason="$1"
local recovery_state="$2"
local network_ok="$3"
local camera_ok="$4"
local ros_ok="$5"
local bside_ok="$6"
local tmp_file
# Write to a per-process temp file and then move it over the real path,
# presumably so readers do not observe a partially written file.
tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$"
cat > "${tmp_file}" <<EOF
{
"updated_at_epoch_ms": $(now_epoch_ms),
"fault_reason": "${fault_reason}",
"recovery_state": "${recovery_state}",
"network_ok": ${network_ok},
"camera_ok": ${camera_ok},
"ros_ok": ${ros_ok},
"bside_ok": ${bside_ok},
"network_fail_count": ${NETWORK_FAIL_COUNT},
"targeted_restart_count": $(targeted_restart_total),
"full_restart_count": ${FULL_RESTART_WINDOW_COUNT},
"last_action": "${LAST_ACTION}",
"last_action_epoch_ms": ${LAST_ACTION_EPOCH_MS}
}
EOF
mv -f "${tmp_file}" "${WATCHDOG_STATUS_FILE}"
}
# Record the most recent recovery action and the time it was taken.
#   $1  action label (stored in LAST_ACTION)
set_last_action() {
    local action="$1"
    LAST_ACTION="${action}"
    LAST_ACTION_EPOCH_MS="$(now_epoch_ms)"
}
# Print the sum of the targeted-restart counters across all fault keys.
targeted_restart_total() {
    local sum=0
    local per_key_count
    for per_key_count in "${TARGETED_RESTART_WINDOW_COUNT[@]}"; do
        sum=$(( sum + per_key_count ))
    done
    printf '%s\n' "${sum}"
}
# Count a targeted restart for one fault key inside a 60-second window.
#   $1  fault key (index into the TARGETED_RESTART_WINDOW_* arrays)
# Starts a new window (count = 1) when none exists or the current one is older
# than 60 seconds; otherwise increments the count.
# Succeeds (status 0) when the count in the window has reached 2 or more,
# which the caller uses as the signal to escalate.
register_targeted_restart() {
    local key="$1"
    local current_sec
    local started_sec
    local seen
    current_sec="$(now_epoch_sec)"
    started_sec="${TARGETED_RESTART_WINDOW_START["${key}"]:-0}"
    seen="${TARGETED_RESTART_WINDOW_COUNT["${key}"]:-0}"
    if (( started_sec != 0 && current_sec - started_sec <= 60 )); then
        seen=$(( seen + 1 ))
    else
        started_sec="${current_sec}"
        seen=1
    fi
    TARGETED_RESTART_WINDOW_START["${key}"]="${started_sec}"
    TARGETED_RESTART_WINDOW_COUNT["${key}"]="${seen}"
    (( seen >= 2 ))
}
# Count a full-stack restart inside a 600-second window and arm the backoff.
# A new window (count = 1) starts when none exists or the current one is older
# than 600 seconds. Once the count in the window reaches 3, BACKOFF_UNTIL is
# set to 60 seconds from now.
record_full_restart() {
    local current_sec
    current_sec="$(now_epoch_sec)"
    if (( FULL_RESTART_WINDOW_START != 0 && current_sec - FULL_RESTART_WINDOW_START <= 600 )); then
        FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 ))
    else
        FULL_RESTART_WINDOW_START="${current_sec}"
        FULL_RESTART_WINDOW_COUNT=1
    fi
    if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then
        BACKOFF_UNTIL=$(( current_sec + 60 ))
    fi
}
# Restart only the b_side service, escalating to a full stack restart when the
# same fault key has already triggered a targeted restart in its window.
#   $1  fault key used for per-fault restart accounting
#   $2  human-readable reason, included in log lines
# Returns 0 after an escalation (regardless of its outcome) or a successful
# restart; otherwise returns the exit status of `systemctl restart`.
restart_bside_targeted() {
    local fault_key="$1"
    local reason="$2"
    # Declared local so the status does not leak into the main-loop scope.
    local rc=0
    if register_targeted_restart "${fault_key}"; then
        blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0
        # The escalation outcome is logged by full_restart_stack itself.
        full_restart_stack "${reason}-escalated" || true
        return 0
    fi
    set_last_action "restart-bside"
    RECOVERY_ACTION_TAKEN=1
    blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0
    systemctl restart "${B_SIDE_SERVICE}" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}"
        return "${rc}"
    fi
    blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0
}
# Ordered full restart: stop b_side, restart the ROS receiver, wait for its
# unix socket, then start b_side again.
#   $1  human-readable reason, included in log lines and stored in fault_reason
# Sets the globals RECOVERY_ACTION_TAKEN, recovery_state and fault_reason, and
# records the attempt with record_full_restart on every exit path.
# Returns 0 on success, otherwise the exit status of the step that failed.
full_restart_stack() {
    local reason="$1"
    local rc=0
    set_last_action "full-restart"
    RECOVERY_ACTION_TAKEN=1
    recovery_state="recovering"
    fault_reason="${reason}"
    blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0
    systemctl stop "${B_SIDE_SERVICE}" || true

    # Capture the status with `|| rc=$?`. Inside `if ! cmd; then`, $? is the
    # status of the negated pipeline (always 0), which hid the real error code.
    systemctl restart "${ROS_SERVICE}" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}"
        record_full_restart
        return "${rc}"
    fi
    blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0

    bash "${SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}"
        record_full_restart
        return "${rc}"
    fi

    systemctl start "${B_SIDE_SERVICE}" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}"
        record_full_restart
        return "${rc}"
    fi
    blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0
    record_full_restart
}
# Succeed when synthetic network fault injection is enabled
# (BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION is "1") and the flag file exists.
network_fault_injected() {
    [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" == "1" ]] || return 1
    [[ -f "${NETWORK_FAULT_FILE}" ]]
}
# Store the resolved 5G interface name in NETWORK_LAST_INTERFACE.
# Succeeds only when a non-empty name was resolved; a resolver failure leaves
# the variable empty.
resolve_network_interface() {
    local detected
    detected="$(blitz_resolve_5g_interface || true)"
    NETWORK_LAST_INTERFACE="${detected}"
    [[ -n "${NETWORK_LAST_INTERFACE}" ]]
}
# Succeed when the network path to BLITZ_TIME_SERVER_IP looks usable.
# Fails when a synthetic fault is injected, the interface cannot be resolved,
# blitz_route_ready prints nothing, or a single ping (2 s timeout) through the
# resolved interface fails. NETWORK_LAST_INTERFACE is reset on entry and
# holds the resolved interface afterwards.
network_is_healthy() {
    local route_info
    NETWORK_LAST_INTERFACE=""
    if network_fault_injected; then
        return 1
    fi
    resolve_network_interface || return 1
    route_info="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)"
    [[ -n "${route_info}" ]] || return 1
    ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1
}
# Poll network_is_healthy once per second until it succeeds or the timeout
# elapses.
#   $1  timeout in seconds
# Logs a "waiting" line on the first poll and every fifth second after that.
# Returns 0 as soon as the network is healthy, 1 on timeout.
wait_for_network_recovery() {
    local timeout_sec="$1"
    local elapsed
    for (( elapsed = 0; elapsed < timeout_sec; elapsed++ )); do
        if network_is_healthy; then
            blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${elapsed}" 0
            return 0
        fi
        if (( elapsed % 5 == 0 )); then
            blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${elapsed}" 0
        fi
        sleep 1
    done
    blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1
    return 1
}
# Recover the network link: stop b_side, re-run the 5G dial script, wait for
# the route to come back, then restore services.
# On success, starts the network-check cooldown and resets the failure counter;
# b_side gets a targeted restart when the ROS receiver is healthy, otherwise
# the whole stack is restarted.
# Returns 0 once the network is back (regardless of the service restart
# outcome); otherwise returns the exit status of the redial or post-check.
perform_network_recovery() {
    local rc=0
    set_last_action "network-recovery"
    RECOVERY_ACTION_TAKEN=1
    blitz_log "${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0
    systemctl stop "${B_SIDE_SERVICE}" || true

    # Capture the status with `|| rc=$?`. Inside `if ! cmd; then`, $? is the
    # status of the negated pipeline (always 0), so failures were reported as 0.
    bash "${SCRIPT_DIR}/5g-dial.sh" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT}" "${rc}"
        return "${rc}"
    fi

    wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}" || rc=$?
    if (( rc != 0 )); then
        blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}"
        return "${rc}"
    fi

    NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC ))
    NETWORK_FAIL_COUNT=0
    if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
        restart_bside_targeted "network" "network-recovered"
        return 0
    fi
    full_restart_stack "network-recovered-ros-unhealthy"
    return 0
}
# --- Startup: load configuration, verify prerequisites, derive file paths. ---
blitz_load_boot_env
blitz_require_root "${STEP}"
# NOTE(review): blitz_require_command is defined in common.sh; presumably it aborts when the command is missing.
blitz_require_command systemctl "${STEP}"
blitz_require_command stat "${STEP}"
blitz_require_command ping "${STEP}"
blitz_require_command python3 "${STEP}"
blitz_prepare_runtime_dir
B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json"
ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json"
WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json"
NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down"
# --- Main loop: one health pass every BLITZ_WATCHDOG_INTERVAL_SEC seconds. ---
# Checks run in the order network, camera, b_side, ROS receiver. Once any check
# has taken a recovery action (RECOVERY_ACTION_TAKEN=1), the later service
# checks are skipped for this pass.
while true; do
# Per-pass state; fault_reason and recovery_state keep the value set by the last check that fired.
fault_reason="none"
recovery_state="ok"
network_ok=1
camera_ok=1
ros_ok=1
bside_ok=1
RECOVERY_ACTION_TAKEN=0
now_sec="$(now_epoch_sec)"
# Backoff after repeated full restarts: skip every check and report all flags as 0.
if (( BACKOFF_UNTIL > now_sec )); then
fault_reason="backoff"
recovery_state="backoff"
write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
continue
fi
# Network: skipped during the post-recovery cooldown; otherwise count consecutive
# failures and start a network recovery once the threshold is reached.
if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then
recovery_state="recovering"
elif ! network_is_healthy; then
network_ok=0
NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 ))
fault_reason="network_or_robot_unreachable"
recovery_state="recovering"
blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1
if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then
perform_network_recovery || true
fi
else
NETWORK_FAIL_COUNT=0
fi
# Camera: a missing device is only reported as degraded. After it reappears, wait for
# two passes without any other recovery action before restarting b_side.
if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then
camera_ok=0
fault_reason="camera_missing"
recovery_state="degraded"
CAMERA_MISSING_PREV=1
CAMERA_RECOVERY_STABLE_COUNT=0
elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then
CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 ))
recovery_state="recovering"
fault_reason="camera_recovered"
if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then
restart_bside_targeted "camera" "camera-reappeared" || true
CAMERA_MISSING_PREV=0
CAMERA_RECOVERY_STABLE_COUNT=0
fi
else
CAMERA_RECOVERY_STABLE_COUNT=0
fi
# b_side: unhealthy when the unit is not active or its status file has not been modified recently.
if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then
bside_ok=0
fault_reason="bside_status_stale"
recovery_state="recovering"
restart_bside_targeted "bside" "bside-unhealthy" || true
fi
# ROS receiver: any failed health condition triggers the ordered full restart.
if (( RECOVERY_ACTION_TAKEN == 0 )) && ! ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then
ros_ok=0
fault_reason="ros_receiver_unhealthy"
recovery_state="recovering"
full_restart_stack "ros-unhealthy" || true
fi
write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}"
sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}"
done

View File

@@ -52,6 +52,7 @@ blitz_load_boot_env() {
export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}"
export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}"
export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}"
export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}"
export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}"
export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}"
@@ -65,12 +66,14 @@ blitz_load_boot_env() {
export BLITZ_5G_SERIAL_WAIT_SEC="${BLITZ_5G_SERIAL_WAIT_SEC:-60}"
export BLITZ_5G_ROUTE_WAIT_SEC="${BLITZ_5G_ROUTE_WAIT_SEC:-30}"
export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-${default_time_server}}"
export BLITZ_TIME_SERVER_PORT="${BLITZ_TIME_SERVER_PORT:-123}"
export BLITZ_TIME_SYNC_WAIT_SEC="${BLITZ_TIME_SYNC_WAIT_SEC:-60}"
export BLITZ_TIME_SYNC_MAX_OFFSET_SEC="${BLITZ_TIME_SYNC_MAX_OFFSET_SEC:-0.002}"
export BLITZ_TIME_SYNC_INTERVAL_SEC="${BLITZ_TIME_SYNC_INTERVAL_SEC:-1}"
export BLITZ_ROS_USER="${BLITZ_ROS_USER:-nvidia}"
export BLITZ_ROS_SOCKET_WAIT_SEC="${BLITZ_ROS_SOCKET_WAIT_SEC:-20}"
export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}"
export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}"
export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}"
export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}"
export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}"
export BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}"
export BLITZ_BOOT_ENV_LOADED="1"
}
@@ -200,3 +203,51 @@ blitz_route_ready() {
printf '%s\n' "${route_output}"
return 0
}
# Print the name of the 5G network interface.
# Resolution order:
#   1. BLITZ_5G_INTERFACE, when set to a non-empty value.
#   2. The "interface" field of the JSON file named by BLITZ_5G_INFO_JSON.
# Returns non-zero when neither source yields a non-empty name.
blitz_resolve_5g_interface() {
    local configured="${BLITZ_5G_INTERFACE:-}"
    local info_path="${BLITZ_5G_INFO_JSON:-}"
    if [[ -n "${configured}" ]]; then
        printf '%s\n' "${configured}"
        return 0
    fi
    [[ -n "${info_path}" && -f "${info_path}" ]] || return 1
    python3 - "${info_path}" <<'PY'
import json
import sys

info_path = sys.argv[1]

try:
    with open(info_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
except Exception:
    sys.exit(1)

interface = str(payload.get("interface") or "").strip()
if not interface:
    sys.exit(1)
print(interface)
PY
}
# Create the runtime directory (BLITZ_RUNTIME_DIR) and set its permissions.
# As root: owner becomes root:BLITZ_ROS_USER with mode 0775.
# As any other user: the chmod is attempted on a best-effort basis.
blitz_prepare_runtime_dir() {
    local dir
    blitz_load_boot_env
    dir="${BLITZ_RUNTIME_DIR}"
    mkdir -p "${dir}"
    if (( EUID != 0 )); then
        chmod 0775 "${dir}" 2>/dev/null || true
    else
        chown "root:${BLITZ_ROS_USER}" "${dir}"
        chmod 0775 "${dir}"
    fi
    blitz_log "runtime-dir" "prepare" "success" "path=${dir}" 0
}

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
STEP="disable"
SYSTEMD_DEST_DIR="/etc/systemd/system"
UNITS=(
"blitz-watchdog.service"
"blitz-b-side-omnid.service"
"blitz-ros-receiver.service"
"blitz-5g-dial.service"
"blitz-boot-gate.service"
"blitz-robot.target"
)
# Stop a unit, but only when its unit file is installed in SYSTEMD_DEST_DIR.
# A failing stop is tolerated.
#   $1  systemd unit name
stop_unit_if_present() {
    local unit="$1"
    [[ -f "${SYSTEMD_DEST_DIR}/${unit}" ]] || return 0
    blitz_run "${STEP}" "stop-unit" systemctl stop "${unit}" || true
}
# Disable a unit, but only when its unit file is installed in SYSTEMD_DEST_DIR.
# A failing disable is tolerated.
#   $1  systemd unit name
disable_unit_if_present() {
    local unit="$1"
    [[ -f "${SYSTEMD_DEST_DIR}/${unit}" ]] || return 0
    blitz_run "${STEP}" "disable-unit" systemctl disable "${unit}" || true
}
# Main flow: requires root and systemctl.
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
# First pass: stop every installed unit in the order listed in UNITS (the watchdog is listed first).
for unit_name in "${UNITS[@]}"; do
stop_unit_if_present "${unit_name}"
done
# Second pass: disable the units so they are not started on the next boot.
for unit_name in "${UNITS[@]}"; do
disable_unit_if_present "${unit_name}"
done
blitz_log "${STEP}" "complete" "success" "boot chain stopped and disabled; next reboot will not auto-start blitz services" 0

View File

@@ -30,6 +30,19 @@ install_unit() {
blitz_log "install" "install-unit" "success" "unit=${SYSTEMD_DEST_DIR}/${template_name%.in}" 0
}
# Disable, stop and delete an installed unit file; do nothing when the unit
# file is not present in SYSTEMD_DEST_DIR.
#   $1  systemd unit name
remove_unit_if_present() {
    local unit="$1"
    local installed_path="${SYSTEMD_DEST_DIR}/${unit}"
    [[ -f "${installed_path}" ]] || return 0
    # Best effort: output is discarded and failures are ignored.
    systemctl disable --now "${unit}" >/dev/null 2>&1 || true
    rm -f "${installed_path}"
    blitz_log "install" "remove-unit" "success" "unit=${installed_path}" 0
}
blitz_load_boot_env
blitz_require_root "install"
blitz_require_command install "install"
@@ -40,13 +53,15 @@ install -d -m 0755 "$(dirname "${BLITZ_LOG_FILE}")"
touch "${BLITZ_LOG_FILE}"
chmod 0644 "${BLITZ_LOG_FILE}"
blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0
blitz_prepare_runtime_dir
install_unit "blitz-boot-gate.service.in"
install_unit "blitz-5g-dial.service.in"
install_unit "blitz-time-sync.service.in"
install_unit "blitz-ros-receiver.service.in"
install_unit "blitz-b-side-omnid.service.in"
install_unit "blitz-watchdog.service.in"
install_unit "blitz-robot.target.in"
remove_unit_if_present "blitz-time-sync.service"
blitz_run "install" "daemon-reload" systemctl daemon-reload
blitz_run "install" "enable-target" systemctl enable blitz-robot.target

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Create the Blitz runtime directory and set its permissions.
# The systemd unit templates in this change run it as an ExecStartPre step.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
STEP="runtime-dir"
# blitz_prepare_runtime_dir also calls blitz_load_boot_env itself.
blitz_load_boot_env
blitz_prepare_runtime_dir
blitz_log "${STEP}" "complete" "success" "runtime_dir=${BLITZ_RUNTIME_DIR}" 0

View File

@@ -3,6 +3,7 @@
BLITZ_BOOT_DELAY_SEC="30"
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log"
BLITZ_RUNTIME_DIR="/run/blitz-robot"
BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot"
BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2"
@@ -18,13 +19,15 @@ BLITZ_5G_ROUTE_WAIT_SEC="30"
# Leave empty to fall back to the host part of ROBOT_SIDE_OMNISOCKET_SERVER_ADDR.
BLITZ_TIME_SERVER_IP="81.70.156.140"
BLITZ_TIME_SERVER_PORT="10910"
BLITZ_TIME_SYNC_WAIT_SEC="30"
BLITZ_TIME_SYNC_MAX_OFFSET_SEC="0.002"
BLITZ_TIME_SYNC_INTERVAL_SEC="1"
BLITZ_ROS_USER="nvidia"
BLITZ_ROS_SOCKET_WAIT_SEC="20"
BLITZ_WATCHDOG_INTERVAL_SEC="5"
BLITZ_HEALTH_STALE_SEC="15"
BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15"
BLITZ_NETWORK_FAIL_THRESHOLD="3"
BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30"
BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0"
# Boot units run b_side_omnid as root directly, so nested sudo must stay off.
B_SIDE_OMNID_USE_SUDO="0"

View File

@@ -1,10 +1,11 @@
[Unit]
Description=Blitz robot b-side omnid
After=blitz-time-sync.service blitz-ros-receiver.service
Wants=blitz-time-sync.service blitz-ros-receiver.service
After=blitz-5g-dial.service blitz-ros-receiver.service
Wants=blitz-5g-dial.service blitz-ros-receiver.service
[Service]
Type=simple
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh
Restart=always
RestartSec=2

View File

@@ -2,9 +2,9 @@
Description=Blitz robot boot chain
Wants=blitz-boot-gate.service
Wants=blitz-5g-dial.service
Wants=blitz-time-sync.service
Wants=blitz-ros-receiver.service
Wants=blitz-b-side-omnid.service
Wants=blitz-watchdog.service
After=multi-user.target
[Install]

View File

@@ -1,11 +1,13 @@
[Unit]
Description=Blitz robot ROS receiver
After=blitz-time-sync.service
Wants=blitz-time-sync.service
After=blitz-5g-dial.service
Wants=blitz-5g-dial.service
[Service]
Type=simple
User=@BLITZ_ROS_USER@
PermissionsStartOnly=true
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh
ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver
Restart=always

View File

@@ -1,14 +0,0 @@
[Unit]
Description=Blitz robot private chrony sync
After=blitz-5g-dial.service
Wants=blitz-5g-dial.service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/time-sync.sh
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -0,0 +1,16 @@
[Unit]
Description=Blitz robot health watchdog
After=blitz-b-side-omnid.service blitz-ros-receiver.service
Wants=blitz-b-side-omnid.service blitz-ros-receiver.service
[Service]
Type=simple
ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh
ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh
Restart=always
RestartSec=5
StandardOutput=append:@BLITZ_LOG_FILE@
StandardError=append:@BLITZ_LOG_FILE@
[Install]
WantedBy=blitz-robot.target

View File

@@ -1,114 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/common.sh"
STEP="time-sync"
CHRONY_SOURCES_DIR="/etc/chrony/sources.d"
CHRONY_SOURCE_FILE="${CHRONY_SOURCES_DIR}/blitz-robot.sources"
CHRONY_MAIN_CONF="/etc/chrony/chrony.conf"
CHRONY_MAIN_CONF_BAK="/etc/chrony/chrony.conf.blitz-bak"
CHRONY_BURST_SAMPLES="${CHRONY_BURST_SAMPLES:-1/2}"
# Print the name of the installed chrony systemd unit.
# Probes chrony.service first, then chronyd.service; falls back to
# "chrony.service" when neither is listed by systemd.
chrony_unit_name() {
    local candidate
    for candidate in chrony chronyd; do
        if systemctl list-unit-files "${candidate}.service" --no-legend 2>/dev/null | grep -q "^${candidate}\.service"; then
            printf '%s\n' "${candidate}.service"
            return 0
        fi
    done
    printf '%s\n' "chrony.service"
}
# Rewrite the main chrony configuration so that only Blitz-managed sources are used.
# - Backs up the original config once (skipped when the backup already exists).
# - Comments out every active `pool` / `server` line with a "# blitz-managed-disabled" prefix.
# - Ensures a `sourcedir /etc/chrony/sources.d` directive is present.
# The config file is only overwritten when the generated content differs from it.
ensure_chrony_main_conf() {
local temp_file
blitz_require_file "${CHRONY_MAIN_CONF}" "${STEP}"
mkdir -p "${CHRONY_SOURCES_DIR}"
if [[ ! -f "${CHRONY_MAIN_CONF_BAK}" ]]; then
cp -a "${CHRONY_MAIN_CONF}" "${CHRONY_MAIN_CONF_BAK}"
blitz_log "${STEP}" "backup-config" "success" "backup=${CHRONY_MAIN_CONF_BAK}" 0
fi
temp_file="$(mktemp)"
# awk pass: existing comment lines are printed unchanged, pool/server lines are commented out,
# everything else is copied through.
awk '
/^[[:space:]]*#/ { print; next }
/^[[:space:]]*(pool|server)[[:space:]]+/ {
print "# blitz-managed-disabled " $0
next
}
{ print }
' "${CHRONY_MAIN_CONF}" > "${temp_file}"
# Append the sourcedir directive only when no active one is already present.
if ! grep -Eq '^[[:space:]]*sourcedir[[:space:]]+/etc/chrony/sources\.d([[:space:]]|$)' "${temp_file}"; then
printf '\n# blitz-managed\nsourcedir /etc/chrony/sources.d\n' >> "${temp_file}"
fi
if ! cmp -s "${temp_file}" "${CHRONY_MAIN_CONF}"; then
cp "${temp_file}" "${CHRONY_MAIN_CONF}"
blitz_log "${STEP}" "rewrite-main-config" "success" "commented non-Blitz pool/server entries in ${CHRONY_MAIN_CONF}" 0
else
blitz_log "${STEP}" "rewrite-main-config" "success" "main config already matches Blitz expectations" 0
fi
rm -f "${temp_file}"
}
# Write the Blitz chrony source file containing a single `server` entry built from
# BLITZ_TIME_SERVER_IP and BLITZ_TIME_SERVER_PORT.
# The file is only (re)written when it is missing or its content differs.
write_chrony_source_file() {
local temp_file
temp_file="$(mktemp)"
# Render the desired content into a temp file so it can be compared with the installed one.
cat <<EOF > "${temp_file}"
# blitz-managed
server ${BLITZ_TIME_SERVER_IP} port ${BLITZ_TIME_SERVER_PORT} iburst
EOF
if [[ ! -f "${CHRONY_SOURCE_FILE}" ]] || ! cmp -s "${temp_file}" "${CHRONY_SOURCE_FILE}"; then
cp "${temp_file}" "${CHRONY_SOURCE_FILE}"
blitz_log "${STEP}" "write-source" "success" "source_file=${CHRONY_SOURCE_FILE} server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT}" 0
else
blitz_log "${STEP}" "write-source" "success" "source_file already matches ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0
fi
rm -f "${temp_file}"
}
# Main flow: requires root, systemctl and chronyc.
blitz_load_boot_env
blitz_require_root "${STEP}"
blitz_require_command systemctl "${STEP}"
blitz_require_command chronyc "${STEP}"
# Validate the configured time server address and port before touching any chrony file.
if [[ -z "${BLITZ_TIME_SERVER_IP}" ]]; then
blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_IP is empty and no fallback could be derived" 1
exit 1
fi
if ! [[ "${BLITZ_TIME_SERVER_PORT}" =~ ^[0-9]+$ ]] || (( BLITZ_TIME_SERVER_PORT < 1 || BLITZ_TIME_SERVER_PORT > 65535 )); then
blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_PORT must be an integer between 1 and 65535" 1
exit 1
fi
# Install the configuration, then restart chrony and request a measurement burst.
ensure_chrony_main_conf
write_chrony_source_file
CHRONY_UNIT="$(chrony_unit_name)"
blitz_run "${STEP}" "restart-chrony" systemctl restart "${CHRONY_UNIT}"
blitz_run "${STEP}" "burst" chronyc burst "${CHRONY_BURST_SAMPLES}"
blitz_log "${STEP}" "waitsync" "start" "server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT} wait_sec=${BLITZ_TIME_SYNC_WAIT_SEC} max_offset_sec=${BLITZ_TIME_SYNC_MAX_OFFSET_SEC} interval_sec=${BLITZ_TIME_SYNC_INTERVAL_SEC}" 0
# A failed waitsync is logged as "soft_fail" and does not abort the script.
# NOTE(review): BLITZ_TIME_SYNC_WAIT_SEC is passed as the first waitsync argument — confirm its meaning against the chronyc documentation.
if chronyc waitsync "${BLITZ_TIME_SYNC_WAIT_SEC}" "${BLITZ_TIME_SYNC_MAX_OFFSET_SEC}" 1000 "${BLITZ_TIME_SYNC_INTERVAL_SEC}"; then
blitz_log "${STEP}" "waitsync" "success" "chrony synchronized to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0
else
rc=$?
blitz_log "${STEP}" "waitsync" "soft_fail" "chrony did not synchronize to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT} within the configured timeout" "${rc}"
fi
# Diagnostics only: print the chrony state; failures here are ignored.
blitz_log "${STEP}" "tracking" "start" "chronyc tracking" 0
chronyc tracking || true
blitz_log "${STEP}" "sources" "start" "chronyc sources -v" 0
chronyc sources -v || true
blitz_log "${STEP}" "complete" "success" "time-sync step finished" 0

View File

@@ -180,6 +180,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const
video_pipeline_set_error(stats, buffer);
}
/*
 * Invoke the optional progress callback stored in the pipeline configuration,
 * passing it the configured progress context.
 * Does nothing when config is NULL or no callback is set.
 */
static void video_pipeline_report_progress(const video_pipeline_config_t *config) {
    if (config != NULL && config->progress_callback != NULL) {
        config->progress_callback(config->progress_context);
    }
}
void video_pipeline_config_init(video_pipeline_config_t *config) {
if (config == NULL) {
return;
@@ -846,6 +853,7 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
memset(&transport_stats, 0, sizeof(transport_stats));
memset(&packet_metadata, 0, sizeof(packet_metadata));
video_pipeline_report_progress(config);
if (config->max_frames > 0 && frame_index >= config->max_frames) {
break;

View File

@@ -161,6 +161,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const
video_pipeline_set_error(stats, buffer);
}
/*
 * Invoke the optional progress callback stored in the pipeline configuration,
 * passing it the configured progress context.
 * Does nothing when config is NULL or no callback is set.
 */
static void video_pipeline_report_progress(const video_pipeline_config_t *config) {
    if (config != NULL && config->progress_callback != NULL) {
        config->progress_callback(config->progress_context);
    }
}
void video_pipeline_config_init(video_pipeline_config_t *config) {
if (config == NULL) {
return;
@@ -757,6 +764,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta
double send_end_ms = 0.0;
int frame_number = frame_index + 1;
video_pipeline_report_progress(config);
if (config->max_frames > 0 && frame_index >= config->max_frames) {
break;
}