diff --git a/cmd/b_side_omnid.c b/cmd/b_side_omnid.c index c340c0f..d7e721b 100644 --- a/cmd/b_side_omnid.c +++ b/cmd/b_side_omnid.c @@ -3,12 +3,16 @@ #include #include #include +#include #include #include +#include #include #include +#include #include +#include "cJSON.h" #include "control_protocol.h" #include "protocol.h" #include "video_pipeline.h" @@ -17,6 +21,13 @@ #define CONTROL_DEFAULT_EXPECTED_SENDER "peer-a-ctrl" #define CONTROL_DEFAULT_UNIX_SOCKET "/tmp/omnisocket-b-side-cmd.sock" #define CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS 3000 +#define DEFAULT_RUNTIME_DIR "/run/blitz-robot" +#define DEFAULT_STATUS_FILE_NAME "b-side-omnid.status.json" +#define DEFAULT_VIDEO_THREAD_FAULT_FILE "fault-injection-bside-video-thread-stall" +#define DEFAULT_CONTROL_THREAD_FAULT_FILE "fault-injection-bside-control-thread-stall" +#define DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC 15 +#define EXIT_CODE_VIDEO_THREAD_STALLED 101 +#define EXIT_CODE_CONTROL_THREAD_STALLED 102 typedef struct unix_dgram_client { int fd; @@ -52,6 +63,13 @@ typedef struct daemon_state { const char *control_expected_sender; const char *control_unix_socket; int control_server_idle_reconnect_ms; + const char *runtime_dir; + int heartbeat_timeout_sec; + char status_file_path[512]; + char video_thread_fault_file[512]; + char control_thread_fault_file[512]; + atomic_long video_thread_heartbeat_epoch_sec; + atomic_long control_thread_heartbeat_epoch_sec; unix_dgram_client_t unix_client; control_bridge_stats_t control_stats; } daemon_state_t; @@ -109,6 +127,79 @@ static int env_int_or_default(const char *name, int fallback) { return parsed; } +static int64_t realtime_epoch_ms(void) { + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + return (int64_t) ts.tv_sec * 1000 + ts.tv_nsec / 1000000; +} + +static long realtime_epoch_sec(void) { + return (long) time(NULL); +} + +static void update_thread_heartbeat(atomic_long *heartbeat) { + if (heartbeat == NULL) { + return; + } + atomic_store(heartbeat, 
realtime_epoch_sec()); +} + +static void video_pipeline_heartbeat_progress(void *context) { + update_thread_heartbeat((atomic_long *) context); +} + +static int ensure_runtime_dir(const char *runtime_dir) { + struct stat st; + + if (runtime_dir == NULL || runtime_dir[0] == '\0') { + errno = EINVAL; + return -1; + } + if (stat(runtime_dir, &st) == 0) { + if (S_ISDIR(st.st_mode)) { + return 0; + } + errno = ENOTDIR; + return -1; + } + if (errno != ENOENT) { + return -1; + } + if (mkdir(runtime_dir, 0775) != 0 && errno != EEXIST) { + return -1; + } + return 0; +} + +static int path_exists(const char *path) { + return path != NULL && path[0] != '\0' && access(path, F_OK) == 0; +} + +static int consume_fault_flag(const char *path) { + if (!path_exists(path)) { + return 0; + } + unlink(path); + return 1; +} + +static void maybe_inject_thread_stall(daemon_state_t *state, const char *fault_path, const char *thread_name) { + if (state == NULL || fault_path == NULL || thread_name == NULL) { + return; + } + if (!consume_fault_flag(fault_path)) { + return; + } + fprintf( + stderr, + "[b_side_omnid] fault injection requested for %s thread, sleeping past %d second heartbeat timeout\n", + thread_name, + state->heartbeat_timeout_sec + ); + sleep((unsigned int) state->heartbeat_timeout_sec + 2U); +} + static int control_bridge_stats_init(control_bridge_stats_t *stats) { int rc; if (stats == NULL) { @@ -132,6 +223,138 @@ static void control_bridge_stats_destroy(control_bridge_stats_t *stats) { } static void unix_dgram_client_close(unix_dgram_client_t *client); +static void control_bridge_stats_snapshot(control_bridge_stats_t *stats, control_bridge_stats_t *out_stats); + +static int write_status_json_atomic(const char *path, cJSON *root) { + char *json; + char temp_path[640]; + FILE *file; + size_t json_len; + + if (path == NULL || root == NULL) { + errno = EINVAL; + return -1; + } + + json = cJSON_PrintUnformatted(root); + if (json == NULL) { + errno = ENOMEM; + return -1; + } + + 
snprintf(temp_path, sizeof(temp_path), "%s.tmp.%ld", path, (long) getpid()); + file = fopen(temp_path, "wb"); + if (file == NULL) { + cJSON_free(json); + return -1; + } + + json_len = strlen(json); + if (fwrite(json, 1, json_len, file) != json_len || fflush(file) != 0) { + int saved_errno = errno; + + fclose(file); + unlink(temp_path); + cJSON_free(json); + errno = saved_errno; + return -1; + } + if (fclose(file) != 0) { + int saved_errno = errno; + + unlink(temp_path); + cJSON_free(json); + errno = saved_errno; + return -1; + } + if (rename(temp_path, path) != 0) { + int saved_errno = errno; + + unlink(temp_path); + cJSON_free(json); + errno = saved_errno; + return -1; + } + + cJSON_free(json); + return 0; +} + +static int write_daemon_status_file(daemon_state_t *state) { + cJSON *root; + video_pipeline_stats_t video_stats; + control_bridge_stats_t control_stats; + int rc; + + if (state == NULL) { + errno = EINVAL; + return -1; + } + if (ensure_runtime_dir(state->runtime_dir) != 0) { + return -1; + } + + memset(&video_stats, 0, sizeof(video_stats)); + memset(&control_stats, 0, sizeof(control_stats)); + video_pipeline_stats_snapshot(&state->video_stats, &video_stats); + control_bridge_stats_snapshot(&state->control_stats, &control_stats); + + root = cJSON_CreateObject(); + if (root == NULL) { + errno = ENOMEM; + return -1; + } + + cJSON_AddNumberToObject(root, "updated_at_epoch_ms", (double) realtime_epoch_ms()); + cJSON_AddNumberToObject(root, "pid", (double) getpid()); + cJSON_AddNumberToObject(root, "video_thread_heartbeat_epoch_ms", (double) atomic_load(&state->video_thread_heartbeat_epoch_sec) * 1000.0); + cJSON_AddNumberToObject(root, "control_thread_heartbeat_epoch_ms", (double) atomic_load(&state->control_thread_heartbeat_epoch_sec) * 1000.0); + cJSON_AddBoolToObject(root, "video_connected", video_stats.connected != 0); + cJSON_AddNumberToObject(root, "video_frames_sent", (double) video_stats.frames_sent); + cJSON_AddNumberToObject(root, 
"video_send_errors", (double) video_stats.send_errors); + cJSON_AddNumberToObject(root, "video_backlog_resets", (double) video_stats.backlog_resets); + cJSON_AddStringToObject(root, "video_last_error", video_stats.last_error); + cJSON_AddBoolToObject(root, "control_registered", control_stats.registered != 0); + cJSON_AddNumberToObject(root, "control_reconnect_count", (double) control_stats.reconnect_count); + cJSON_AddNumberToObject(root, "control_unix_send_errors", (double) control_stats.unix_send_errors); + cJSON_AddStringToObject(root, "control_last_error", control_stats.last_error); + + rc = write_status_json_atomic(state->status_file_path, root); + cJSON_Delete(root); + return rc; +} + +static int thread_heartbeat_expired(atomic_long *heartbeat, int timeout_sec, long now_sec) { + long heartbeat_sec; + + if (heartbeat == NULL || timeout_sec <= 0) { + return 0; + } + heartbeat_sec = atomic_load(heartbeat); + if (heartbeat_sec <= 0) { + return 0; + } + return now_sec - heartbeat_sec > timeout_sec; +} + +static void exit_if_thread_stalled(daemon_state_t *state) { + long now_sec; + + if (state == NULL || state->heartbeat_timeout_sec <= 0) { + return; + } + now_sec = realtime_epoch_sec(); + if (thread_heartbeat_expired(&state->video_thread_heartbeat_epoch_sec, state->heartbeat_timeout_sec, now_sec)) { + fprintf(stderr, "[b_side_omnid] video thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec); + fflush(stderr); + exit(EXIT_CODE_VIDEO_THREAD_STALLED); + } + if (thread_heartbeat_expired(&state->control_thread_heartbeat_epoch_sec, state->heartbeat_timeout_sec, now_sec)) { + fprintf(stderr, "[b_side_omnid] control thread heartbeat stalled for more than %d seconds\n", state->heartbeat_timeout_sec); + fflush(stderr); + exit(EXIT_CODE_CONTROL_THREAD_STALLED); + } +} static void control_bridge_set_error(control_bridge_stats_t *stats, const char *message) { if (stats == NULL) { @@ -295,7 +518,10 @@ static void *video_thread_main(void *arg) { 
daemon_state_t *state = (daemon_state_t *) arg; while (!*state->stop_requested) { + update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec); + maybe_inject_thread_stall(state, state->video_thread_fault_file, "video"); int video_rc = video_pipeline_run(&state->video_config, &state->video_stats, state->stop_requested); + update_thread_heartbeat(&state->video_thread_heartbeat_epoch_sec); if (video_rc == 0) { break; @@ -318,6 +544,8 @@ static void *control_thread_main(void *arg) { kcp_client_t *client = NULL; int reconnect_immediately = 0; + update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); + maybe_inject_thread_stall(state, state->control_thread_fault_file, "control"); kcp_conn_options_set_control_defaults(&options); client = kcp_client_dial_with_options( state->control_server_addr, @@ -361,8 +589,10 @@ static void *control_thread_main(void *arg) { int rc; kcp_client_state_t client_state; + update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); protocol_message_init(&msg); rc = kcp_client_receive_timed(client, &msg, 100); + update_thread_heartbeat(&state->control_thread_heartbeat_epoch_sec); if (rc == 1) { char reconnect_reason[256]; @@ -550,6 +780,7 @@ int main(void) { daemon_state_t state; pthread_t video_thread; pthread_t control_thread; + long initial_heartbeat; memset(&state, 0, sizeof(state)); state.stop_requested = &g_stop_requested; @@ -563,10 +794,35 @@ int main(void) { state.control_peer_id = env_or_default("OMNI_CONTROL_PEER_ID", CONTROL_DEFAULT_PEER_ID); state.control_expected_sender = env_or_default("OMNI_CONTROL_EXPECTED_SENDER", CONTROL_DEFAULT_EXPECTED_SENDER); state.control_unix_socket = env_or_default("OMNI_CONTROL_UNIX_SOCKET_PATH", CONTROL_DEFAULT_UNIX_SOCKET); + state.runtime_dir = env_or_default("BLITZ_RUNTIME_DIR", DEFAULT_RUNTIME_DIR); + state.heartbeat_timeout_sec = env_int_or_default( + "BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC", + DEFAULT_THREAD_HEARTBEAT_TIMEOUT_SEC + ); + 
state.video_config.progress_callback = video_pipeline_heartbeat_progress; + state.video_config.progress_context = &state.video_thread_heartbeat_epoch_sec; state.control_server_idle_reconnect_ms = env_int_or_default( "OMNI_CONTROL_SERVER_IDLE_RECONNECT_MS", CONTROL_DEFAULT_SERVER_IDLE_RECONNECT_MS ); + snprintf(state.status_file_path, sizeof(state.status_file_path), "%s/%s", state.runtime_dir, DEFAULT_STATUS_FILE_NAME); + snprintf( + state.video_thread_fault_file, + sizeof(state.video_thread_fault_file), + "%s/%s", + state.runtime_dir, + DEFAULT_VIDEO_THREAD_FAULT_FILE + ); + snprintf( + state.control_thread_fault_file, + sizeof(state.control_thread_fault_file), + "%s/%s", + state.runtime_dir, + DEFAULT_CONTROL_THREAD_FAULT_FILE + ); + initial_heartbeat = realtime_epoch_sec(); + atomic_init(&state.video_thread_heartbeat_epoch_sec, initial_heartbeat); + atomic_init(&state.control_thread_heartbeat_epoch_sec, initial_heartbeat); if (state.video_config.server_addr == NULL || state.video_config.server_addr[0] == '\0' || state.control_server_addr == NULL || state.control_server_addr[0] == '\0') { @@ -624,6 +880,10 @@ int main(void) { while (!g_stop_requested) { sleep(1); print_stats(&state); + if (write_daemon_status_file(&state) != 0) { + fprintf(stderr, "[b_side_omnid] failed to write status file %s: %s\n", state.status_file_path, strerror(errno)); + } + exit_if_thread_stalled(&state); } pthread_join(video_thread, NULL); diff --git a/include/video_pipeline.h b/include/video_pipeline.h index 74cfe1d..ed54dbe 100644 --- a/include/video_pipeline.h +++ b/include/video_pipeline.h @@ -18,6 +18,8 @@ typedef struct video_pipeline_packet_metadata { double longitude; } video_pipeline_packet_metadata_t; +typedef void (*video_pipeline_progress_fn)(void *context); + #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L _Static_assert(sizeof(video_pipeline_packet_metadata_t) == 24, "video trailer metadata must be 24 bytes"); #endif @@ -39,6 +41,8 @@ typedef struct 
video_pipeline_config { int soft_backpressure_segments; int hard_backpressure_segments; int hard_backpressure_hold_ms; + video_pipeline_progress_fn progress_callback; + void *progress_context; } video_pipeline_config_t; typedef struct video_pipeline_stats { diff --git a/ros-control-py/udp_teleop_bridge/udp_teleop_bridge/udp_cmd_vel_receiver.py b/ros-control-py/udp_teleop_bridge/udp_teleop_bridge/udp_cmd_vel_receiver.py index b954357..8eac4fb 100644 --- a/ros-control-py/udp_teleop_bridge/udp_teleop_bridge/udp_cmd_vel_receiver.py +++ b/ros-control-py/udp_teleop_bridge/udp_teleop_bridge/udp_cmd_vel_receiver.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json import os import socket import threading @@ -90,8 +91,14 @@ class UdpCmdVelReceiver(Node): self._last_published_command: CommandTuple = ZERO_COMMAND self._closing = threading.Event() self._recv_buffer = bytearray(DEFAULT_RECV_BUFFER_BYTES) + self._runtime_dir = os.getenv('BLITZ_RUNTIME_DIR', '/run/blitz-robot').strip() or '/run/blitz-robot' + self._status_path = os.path.join(self._runtime_dir, 'ros-receiver.status.json') + self._transport_reconnect_count = 0 + self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms() + self._runtime_last_error = '' self.create_timer(1.0 / self._publish_rate_hz, self._publish_tick) + self.create_timer(1.0, self._write_status_tick) recv_target = self._recv_loop_unix_dgram if self._transport_name == 'unix_dgram' else self._recv_loop self._recv_thread = threading.Thread(target=recv_target, daemon=True) @@ -174,6 +181,8 @@ class UdpCmdVelReceiver(Node): pass try: self._transport = self._create_transport() + self._transport_reconnect_count += 1 + self._set_runtime_last_error('') if self._should_log('transport_reconnected', 1.0): self.get_logger().info( 'Reconnected OmniSocket transport %s://%s as %s' @@ -182,6 +191,7 @@ class UdpCmdVelReceiver(Node): return True except OSError as exc: self._transport = None + self._set_runtime_last_error(str(exc)) if 
self._should_log('transport_reconnect_error', 2.0): self.get_logger().error(f'Failed to reconnect OmniSocket transport: {exc}') time.sleep(0.5) @@ -192,10 +202,13 @@ class UdpCmdVelReceiver(Node): self._close_unix_socket() try: self._setup_unix_socket() + self._transport_reconnect_count += 1 + self._set_runtime_last_error('') if self._should_log('unix_rebound', 1.0): self.get_logger().info(f'Rebound unix datagram socket at {self._local_socket_path}') return True except OSError as exc: + self._set_runtime_last_error(str(exc)) if self._should_log('unix_rebind_error', 2.0): self.get_logger().error(f'Failed to rebind unix datagram socket: {exc}') time.sleep(0.5) @@ -209,6 +222,61 @@ class UdpCmdVelReceiver(Node): return True return False + def _now_epoch_ms(self) -> int: + return time.time_ns() // 1_000_000 + + def _update_recv_heartbeat(self) -> None: + with self._lock: + self._recv_thread_heartbeat_epoch_ms = self._now_epoch_ms() + + def _last_packet_age_ms(self) -> int | None: + with self._lock: + last_packet_monotonic = self._last_packet_monotonic + if last_packet_monotonic is None: + return None + return max(0, int((time.monotonic() - last_packet_monotonic) * 1000.0)) + + def _socket_bound(self) -> bool: + if self._transport_name == 'unix_dgram': + return self._unix_socket is not None and os.path.exists(self._local_socket_path) + return self._transport is not None + + def _set_runtime_last_error(self, message: str) -> None: + self._runtime_last_error = message + + def _status_payload(self) -> dict[str, object]: + with self._lock: + recv_thread_heartbeat_epoch_ms = self._recv_thread_heartbeat_epoch_ms + return { + 'updated_at_epoch_ms': self._now_epoch_ms(), + 'pid': os.getpid(), + 'recv_thread_heartbeat_epoch_ms': recv_thread_heartbeat_epoch_ms, + 'transport': self._transport_name, + 'local_socket_path': self._local_socket_path, + 'socket_bound': self._socket_bound(), + 'transport_reconnect_count': self._transport_reconnect_count, + 'last_packet_age_ms': 
self._last_packet_age_ms(), + 'last_error': self._runtime_last_error, + } + + def _write_status_tick(self) -> None: + payload = self._status_payload() + if self._transport_name == 'unix_dgram': + if self._unix_socket is None: + payload['last_error'] = self._runtime_last_error or 'unix datagram socket is not bound' + else: + if self._transport is None: + payload['last_error'] = self._runtime_last_error or 'OmniSocket transport is not connected' + try: + os.makedirs(self._runtime_dir, exist_ok=True) + temp_path = f'{self._status_path}.tmp.{os.getpid()}' + with open(temp_path, 'w', encoding='utf-8') as handle: + json.dump(payload, handle, ensure_ascii=True, separators=(',', ':')) + os.replace(temp_path, self._status_path) + except OSError as exc: + if self._should_log('status_write_error', 5.0): + self.get_logger().warning(f'Failed to write receiver status file: {exc}') + def _publish_command(self, command: CommandTuple) -> None: msg = TwistStamped() msg.header.stamp = self.get_clock().now().to_msg() @@ -229,32 +297,39 @@ class UdpCmdVelReceiver(Node): def _recv_loop(self) -> None: while not self._closing.is_set() and rclpy.ok(): + self._update_recv_heartbeat() try: assert self._transport is not None meta = self._transport.recv_into(buffer=self._recv_buffer, timeout_ms=100) except BufferError as exc: + self._set_runtime_last_error(str(exc)) if self._should_log('buffer_error', 2.0): self.get_logger().warning(f'Dropped oversized OmniSocket frame: {exc}') continue except OSError as exc: + self._set_runtime_last_error(str(exc)) if not self._closing.is_set() and self._should_log('recv_error', 2.0): self.get_logger().error(f'OmniSocket receive loop stopped: {exc}') if not self._reconnect_transport(): return continue + self._update_recv_heartbeat() if meta is None: continue + self._set_runtime_last_error('') from_peer = str(meta['from']) msg_type = int(meta['msg_type']) body_len = int(meta['body_len']) if msg_type == self._msg_type_error: + 
self._set_runtime_last_error(f'server error message from {from_peer}') self._handle_error_message(from_peer, body_len) continue if self._expected_sender and from_peer != self._expected_sender: + self._set_runtime_last_error(f'unexpected sender {from_peer}') if self._should_log('unexpected_sender', 2.0): self.get_logger().warning( 'Ignoring message from unexpected sender %s (expected %s)' @@ -263,6 +338,7 @@ class UdpCmdVelReceiver(Node): continue if msg_type != self._msg_type_binary: + self._set_runtime_last_error(f'unexpected message type {msg_type}') if self._should_log('unexpected_type', 2.0): self.get_logger().warning( 'Ignoring unexpected message type %d from %s (%d bytes)' @@ -271,6 +347,7 @@ class UdpCmdVelReceiver(Node): continue if body_len != PACKET_SIZE: + self._set_runtime_last_error(f'invalid payload size {body_len}') if self._should_log('packet_size', 2.0): self.get_logger().warning( 'Dropped binary payload from %s with invalid size %d (expected %d)' @@ -281,6 +358,7 @@ class UdpCmdVelReceiver(Node): try: command = unpack_command(self._recv_buffer[:PACKET_SIZE]) except ValueError as exc: + self._set_runtime_last_error(str(exc)) if self._should_log('decode_error', 2.0): self.get_logger().warning(f'Dropped malformed command payload: {exc}') continue @@ -288,15 +366,18 @@ class UdpCmdVelReceiver(Node): with self._lock: self._latest_command = command self._last_packet_monotonic = time.monotonic() + self._set_runtime_last_error('') def _recv_loop_unix_dgram(self) -> None: assert self._unix_socket is not None while not self._closing.is_set() and rclpy.ok(): + self._update_recv_heartbeat() try: payload = self._unix_socket.recv(DEFAULT_RECV_BUFFER_BYTES) except socket.timeout: if not os.path.exists(self._local_socket_path): + self._set_runtime_last_error('unix datagram socket path disappeared') if self._should_log('unix_socket_missing', 2.0): self.get_logger().warning( f'Unix datagram socket path disappeared, rebinding {self._local_socket_path}' @@ -305,13 
+386,16 @@ class UdpCmdVelReceiver(Node): return continue except OSError as exc: + self._set_runtime_last_error(str(exc)) if not self._closing.is_set() and self._should_log('unix_recv_error', 2.0): self.get_logger().error(f'Unix datagram receive loop stopped: {exc}') if not self._rebind_unix_socket(): return continue + self._update_recv_heartbeat() if len(payload) != PACKET_SIZE: + self._set_runtime_last_error(f'invalid unix datagram payload size {len(payload)}') if self._should_log('unix_packet_size', 2.0): self.get_logger().warning( 'Dropped unix datagram payload with invalid size %d (expected %d)' @@ -322,6 +406,7 @@ class UdpCmdVelReceiver(Node): try: command = unpack_command(payload) except ValueError as exc: + self._set_runtime_last_error(str(exc)) if self._should_log('unix_decode_error', 2.0): self.get_logger().warning(f'Dropped malformed unix datagram payload: {exc}') continue @@ -329,6 +414,7 @@ class UdpCmdVelReceiver(Node): with self._lock: self._latest_command = command self._last_packet_monotonic = time.monotonic() + self._set_runtime_last_error('') def _command_for_publish_tick(self) -> tuple[CommandTuple, Optional[float], bool]: with self._lock: diff --git a/scripts/boot/README.md b/scripts/boot/README.md index b8155ce..9f3318d 100644 --- a/scripts/boot/README.md +++ b/scripts/boot/README.md @@ -1,385 +1,210 @@ -# 机器人 B 端开机自启说明 +# Robot B-Side Boot Chain -这个目录是给机器人端做开机自启用的。 +This directory contains the robot-side boot and recovery scripts. -你看到这里多了不少脚本和 `systemd` 单元,不是为了让你手工一条条执行,而是为了把开机流程拆开管理: - -1. 固定启动顺序 -2. 某一步失败时可单独重试 -3. 所有动作统一写到一个本地日志文件 -4. 后面如果要把“固定延时 30 秒”换成“等待机器人原有自检完成”,只改最前面的闸门即可 - -所以平时真正需要人工执行的,通常只有这两步: +Normal usage is: ```bash sudo bash scripts/boot/install-systemd.sh sudo systemctl start blitz-robot.target ``` -以后机器人重启时,就不需要你再手工执行这些脚本了。 +After installation, `blitz-robot.target` is enabled and will start automatically on reboot. 
-## 启动顺序 +To stop the chain now and disable boot-time autostart for future reboots: -当前开机链路如下: +```bash +sudo bash scripts/boot/disable-systemd.sh +``` + +## Current Startup Order + +The current cold-start chain is: 1. `blitz-boot-gate.service` 2. `blitz-5g-dial.service` -3. `blitz-time-sync.service` -4. `blitz-ros-receiver.service` -5. `blitz-b-side-omnid.service` +3. `blitz-ros-receiver.service` +4. `blitz-b-side-omnid.service` +5. `blitz-watchdog.service` -对应业务顺序就是: +There is no longer any automatic time-sync step in the boot chain. -1. 先固定等待 30 秒,给机器人原有自检/自启程序让路 -2. 运行 5G 自动拨号 -3. 运行时钟同步 -4. 启动 `start-ros-receiver.sh` -5. 启动 `start-b-side-omnid.sh` +## What Each Script Does -## 日志文件 +- `robot-boot.env`: default boot configuration +- `robot-boot.env.local`: machine-local overrides +- `common.sh`: shared env loading, logging, and helper functions +- `boot-gate.sh`: fixed startup delay gate +- `5g-dial.sh`: brings up the 5G modem path and verifies routing +- `start-ros-receiver-service.sh`: boot wrapper for ROS receiver +- `wait-for-unix-socket.sh`: waits for the ROS receiver unix socket +- `start-b-side-omnid-service.sh`: boot wrapper for `b_side_omnid` +- `blitz-watchdog.sh`: runtime health watchdog and recovery orchestrator +- `blitz-fault-inject.sh`: fault injection entrypoint +- `install-systemd.sh`: installs systemd units into `/etc/systemd/system` +- `disable-systemd.sh`: stops the boot chain and disables autostart -所有关键操作都会统一写到这个本地文件: +## Important Configuration -```text -/var/log/blitz-robot/startup.log -``` - -每一行日志格式如下: - -```text -timestamp | step | action | result | details | exit_code -``` - -日志里会记录: - -- 做了什么 -- 实际执行了什么命令 -- 前置检查是否通过 -- 成功还是失败 -- 失败原因 -- 退出码 -- 是否发生了重试 - -## 这些文件分别是干什么的 - -- `robot-boot.env`:开机自启默认配置 -- `robot-boot.env.local`:本机覆盖配置,建议把你自己的配置写这里 -- `common.sh`:公共环境加载和统一日志函数 -- `boot-gate.sh`:启动闸门,当前逻辑是固定等待 30 秒 -- `5g-dial.sh`:等待 5G 串口出现,执行 `rndis_dial.py`,删除 5G 默认路由并补齐目标主机路由,然后检查路由是否真的起来 -- `time-sync.sh`:把 `chrony` 指向白名单服务器 IP 
和端口,并执行一次同步 -- `start-ros-receiver-service.sh`:开机版 ROS receiver 启动包装 -- `wait-for-unix-socket.sh`:等待 ROS receiver 建好本地 unix socket -- `start-b-side-omnid-service.sh`:开机版 `b_side_omnid` 启动包装 -- `install-systemd.sh`:把 `systemd` 单元安装到 `/etc/systemd/system` -- `systemd/*.service.in`、`systemd/*.target.in`:`systemd` 模板文件 - -## 前置条件 - -你前面说过,除了时钟同步以外,其他程序环境都应该已经配好了。按这个前提,这里只强调必须确认的前置条件。 - -### 1. 机器人侧必须已有的条件 - -默认认为下面这些已经具备: - -- 系统是 Ubuntu,且使用 `systemd` -- `OmniSocketGo` 仓库已经放在机器人上 -- `scripts/dev/start-ros-receiver.sh` 原本就能正常启动 -- `scripts/dev/start-b-side-omnid.sh` 原本就能正常启动 -- `bin/b_side_omnid` 已经提前编译好 -- 5G 拨号脚本存在:`/home/nvidia/5g-test/5G/rndis_dial.py` -- 5G 串口设备是:`/dev/ttyUSB7` - -注意: - -- 开机模式下不会自动编译 `b_side_omnid` -- 如果 `bin/b_side_omnid` 不存在,服务会直接报错并写日志 - -### 2. 时钟同步需要的前置安装 - -时钟同步这一步依赖 `chrony`。 - -如果机器人侧没有安装,请先安装: - -```bash -sudo apt update -sudo apt install -y chrony -``` - -安装后建议确认: - -```bash -systemctl status chrony -chronyc tracking -``` - -### 3. 云服务器侧需要的前置条件 - -因为你的 5G 是白名单网络,所以时钟同步不能依赖公网域名或默认 NTP 池,必须只用你的白名单云服务器 IP。 - -云服务器侧需要满足: - -- 服务器上运行 `chronyd` -- 安全组 / 防火墙放通你实际使用的 UDP 端口 -- 机器人能访问这台服务器的 IP - -如果云服务器还没有安装 `chrony`,可以参考: - -```bash -sudo apt update -sudo apt install -y chrony -sudo systemctl enable chrony -sudo systemctl restart chrony -``` - -如果你不能使用标准的 `123/udp`,完全可以改成你自己的端口,例如 `10910/udp`。 - -例如云服务器 /etc/chrony/chrony.conf 里改成监听 10910:: - -```conf -port 10910 -allow 0/0 -``` - -然后重启: - -```bash -sudo systemctl restart chrony -``` - -机器人端则在 `robot-boot.env.local` 里配置: - -```bash -BLITZ_TIME_SERVER_IP="你的云服务器IP" -BLITZ_TIME_SERVER_PORT="10910" -``` - -这样 `time-sync.sh` 会自动生成: - -```conf -server 你的云服务器IP port 10910 iburst -``` - -注意:这里必须是你自己可控的 `chronyd` 服务端。公网标准 NTP 服务通常只监听 `123/udp`,不能要求它们改到 `10910`。 - -## 需要改哪些配置 - -不要直接改 `robot-boot.env`,更推荐新建: +Most machine-specific overrides should go into: ```text scripts/boot/robot-boot.env.local ``` -常见要改的是这些: +Typical settings: ```bash BLITZ_BOOT_DELAY_SEC="30" 
BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log" +BLITZ_RUNTIME_DIR="/run/blitz-robot" -BLITZ_5G_DIAL_DIR="/home/nvidia/5g-test/5G" -BLITZ_5G_SERIAL_PORT="/dev/ttyUSB7" +BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot" +BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2" +BLITZ_5G_INTERFACE="" +BLITZ_5G_MODEM_SUBNET="192.168.224.0/22" BLITZ_5G_GATEWAY="192.168.225.1" BLITZ_5G_REMOVE_DEFAULT_ROUTE="1" BLITZ_5G_ROUTE_TARGETS="106.55.173.235" +BLITZ_5G_INFO_JSON="${OMNISOCKETGO_ROOT}/scripts/boot/modem_network_info.json" -BLITZ_TIME_SERVER_IP="你的白名单云服务器IP" -BLITZ_TIME_SERVER_PORT="10910" +BLITZ_TIME_SERVER_IP="81.70.156.140" BLITZ_ROS_USER="nvidia" +BLITZ_ROS_SOCKET_WAIT_SEC="20" +BLITZ_WATCHDOG_INTERVAL_SEC="5" +BLITZ_HEALTH_STALE_SEC="15" +BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15" +BLITZ_NETWORK_FAIL_THRESHOLD="3" +BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30" +BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0" ``` -如果 `BLITZ_TIME_SERVER_IP` 留空,脚本会自动回退到 `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR` 的 IP 部分。 +`BLITZ_TIME_SERVER_IP` is still used, but only as the 5G route/ping health-check target. It is no longer used for automatic clock synchronization. -当 `BLITZ_5G_REMOVE_DEFAULT_ROUTE="1"` 时,脚本会在 5G 拨号完成后删除该接口上的默认路由,避免整机默认出口切到 5G。此时 `BLITZ_TIME_SERVER_IP` 和 `BLITZ_5G_ROUTE_TARGETS` 中的目标 IP 会显式走 5G,其它流量继续走有线或 Wi-Fi 的默认路由。 +If `BLITZ_TIME_SERVER_IP` is left empty, the scripts fall back to the host part of `ROBOT_SIDE_OMNISOCKET_SERVER_ADDR`. 
-## 如何安装和使用 +## Install Or Upgrade -下面假设你当前目录就在 `OmniSocketGo` 仓库根目录。 - -### 第一步:准备本机配置 - -建议先创建: - -```bash -cp scripts/boot/robot-boot.env scripts/boot/robot-boot.env.local -``` - -然后编辑: - -```bash -vim scripts/boot/robot-boot.env.local -``` - -至少确认这几个值是对的: - -- `BLITZ_5G_DIAL_DIR` -- `BLITZ_5G_SERIAL_PORT` -- `BLITZ_TIME_SERVER_IP` -- `BLITZ_TIME_SERVER_PORT` -- `BLITZ_ROS_USER` - -### 第二步:安装 systemd 单元 - -执行: +Run: ```bash sudo bash scripts/boot/install-systemd.sh +sudo systemctl daemon-reload +sudo systemctl restart blitz-robot.target ``` -这个安装脚本会做这些事情: +`install-systemd.sh` will also remove any old `blitz-time-sync.service` unit left over from earlier versions. -1. 创建日志目录和日志文件 -2. 渲染 `systemd` 模板 -3. 把 unit 文件复制到 `/etc/systemd/system` -4. 执行 `systemctl daemon-reload` -5. 执行 `systemctl enable blitz-robot.target` +## Disable Autostart -### 第三步:立刻启动一次 - -执行: +To stop the currently running services and disable autostart for future reboots: ```bash +sudo bash scripts/boot/disable-systemd.sh +``` + +To re-enable later: + +```bash +sudo bash scripts/boot/install-systemd.sh sudo systemctl start blitz-robot.target ``` -### 第四步:以后重启自动生效 +## Logs -因为安装脚本已经做了 `enable`,所以后续机器人重启时会自动拉起,不需要你再手工执行。 - -如果想手工确认,也可以执行: - -```bash -sudo systemctl enable blitz-robot.target -``` - -## 如何查看是否正常 - -### 看总日志文件 - -最直接: - -```bash -tail -f /var/log/blitz-robot/startup.log -``` - -### 看各个服务状态 - -```bash -systemctl status blitz-robot.target -systemctl status blitz-boot-gate.service -systemctl status blitz-5g-dial.service -systemctl status blitz-time-sync.service -systemctl status blitz-ros-receiver.service -systemctl status blitz-b-side-omnid.service -``` - -### 看 journal - -```bash -journalctl -u blitz-robot.target -u blitz-boot-gate.service -u blitz-5g-dial.service \ - -u blitz-time-sync.service -u blitz-ros-receiver.service \ - -u blitz-b-side-omnid.service -f -``` - -## 当前时钟同步会做什么 - -`time-sync.sh` 当前逻辑是: - -1. 读取 `BLITZ_TIME_SERVER_IP` -2. 读取 `BLITZ_TIME_SERVER_PORT` -3. 
修改 `/etc/chrony/chrony.conf` -4. 注释掉原有的 `pool` 和 `server` 项 -5. 保留一个备份文件:`/etc/chrony/chrony.conf.blitz-bak` -6. 写入: +All boot-chain and watchdog logs are appended to: ```text -/etc/chrony/sources.d/blitz-robot.sources +/var/log/blitz-robot/startup.log ``` -7. 生成类似下面这一行: - -```conf -server 你的云服务器IP port 10910 iburst -``` - -8. 重启 `chrony` -9. 执行 `chronyc burst` -10. 执行 `chronyc waitsync` - -注意: - -- 如果同步超时,会记日志为 `soft_fail` -- 但不会阻塞后面的 ROS 和 `b_side_omnid` 启动 - -## 常见问题 - -### 1. 为什么会突然多出这么多脚本? - -因为把开机流程拆成了多个稳定的小步骤: - -- 更容易排查哪一步失败 -- 更容易让 `systemd` 自动重启 -- 更容易记录完整日志 -- 后续更容易替换“30 秒延时”为真正的机器人 ready 条件 - -你平时不需要手工逐个执行这些脚本。 - -### 2. 我是不是要手工跑 `5g-dial.sh`、`time-sync.sh`、`start-ros-receiver-service.sh`? - -正常情况下不用。 - -你只需要: +Follow the log live: ```bash -sudo bash scripts/boot/install-systemd.sh -sudo systemctl start blitz-robot.target +sudo tail -f /var/log/blitz-robot/startup.log ``` -### 3. 如果时钟同步失败怎么办? - -先看: +Check service state: ```bash -tail -f /var/log/blitz-robot/startup.log -systemctl status blitz-time-sync.service -chronyc sources -v -chronyc tracking +sudo systemctl status blitz-robot.target +sudo systemctl status blitz-5g-dial.service +sudo systemctl status blitz-ros-receiver.service +sudo systemctl status blitz-b-side-omnid.service +sudo systemctl status blitz-watchdog.service ``` -优先检查: - -- `BLITZ_TIME_SERVER_IP` 是否填对 -- `BLITZ_TIME_SERVER_PORT` 是否填对 -- 云服务器是否真的跑了 `chronyd` -- 云服务器防火墙 / 安全组是否放通你配置的 UDP 端口,例如 `10910` -- 5G 白名单是否确实允许访问这个服务器 IP - -### 4. 如果 ROS receiver 没起来怎么办? 
- -先看: +Check systemd journal: ```bash -systemctl status blitz-ros-receiver.service -tail -f /var/log/blitz-robot/startup.log +sudo journalctl -u blitz-robot.target -u blitz-5g-dial.service \ + -u blitz-ros-receiver.service -u blitz-b-side-omnid.service \ + -u blitz-watchdog.service -f ``` -再检查: +## Runtime Status Files -- `/opt/ros/${ROS_DISTRO}/setup.bash` 是否存在 -- `${ROS_CONTROL_PY_DIR}/install/setup.bash` 是否存在 -- `ROBOT_RECEIVER_LOCAL_SOCKET_PATH` 对应的 socket 是否出现 +The runtime status directory is: -### 5. 如果 b_side_omnid 没起来怎么办? +```text +/run/blitz-robot +``` -先看: +Key files: + +- `b-side-omnid.status.json` +- `ros-receiver.status.json` +- `watchdog.status.json` + +Pretty-print them: ```bash -systemctl status blitz-b-side-omnid.service -tail -f /var/log/blitz-robot/startup.log +sudo python3 -m json.tool /run/blitz-robot/watchdog.status.json +sudo python3 -m json.tool /run/blitz-robot/b-side-omnid.status.json +sudo python3 -m json.tool /run/blitz-robot/ros-receiver.status.json ``` -再检查: +## Fault Injection -- `bin/b_side_omnid` 是否已经提前编译好 -- 摄像头设备是否存在 -- `robot-remote.env` / `robot-boot.env.local` 里的地址配置是否正确 +Available test commands: + +```bash +sudo bash scripts/boot/blitz-fault-inject.sh bside-crash +sudo bash scripts/boot/blitz-fault-inject.sh bside-process-freeze +sudo bash scripts/boot/blitz-fault-inject.sh bside-video-thread-stall +sudo bash scripts/boot/blitz-fault-inject.sh bside-control-thread-stall +sudo bash scripts/boot/blitz-fault-inject.sh ros-crash +sudo bash scripts/boot/blitz-fault-inject.sh ros-freeze +``` + +For synthetic network fault injection, first enable it in `robot-boot.env.local`: + +```bash +BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="1" +``` + +Then restart watchdog and inject: + +```bash +sudo systemctl restart blitz-watchdog.service +sudo bash scripts/boot/blitz-fault-inject.sh network-down on +sudo bash scripts/boot/blitz-fault-inject.sh network-down off +``` + +## Recovery Behavior Summary + +- If `b_side_omnid` dies or its status file 
goes stale, watchdog first tries a targeted `b_side` restart. +- If ROS receiver dies, loses its socket, or its heartbeat goes stale, watchdog performs an ordered full restart: + - stop `b_side` + - restart ROS receiver + - wait for unix socket + - start `b_side` +- If network checks fail repeatedly, watchdog stops `b_side`, runs `5g-dial.sh`, waits for route recovery, and then restores services. +- Camera disappearance is logged as degraded state. Reappearance triggers a `b_side` restart after the device is stable. + +## Notes + +- `time-sync.sh` and `blitz-time-sync.service` are intentionally removed from the automatic boot path. +- `b_side_omnid` must already be built before boot-time startup. +- `bin/b_side_omnid` missing, ROS env missing, or modem script missing will all show up in `startup.log`. diff --git a/scripts/boot/blitz-fault-inject.sh b/scripts/boot/blitz-fault-inject.sh new file mode 100644 index 0000000..6d8b6e5 --- /dev/null +++ b/scripts/boot/blitz-fault-inject.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="fault-inject" +B_SIDE_SERVICE="blitz-b-side-omnid.service" +ROS_SERVICE="blitz-ros-receiver.service" + +main_pid_for_service() { + local service_name="$1" + systemctl show --property MainPID --value "${service_name}" +} + +require_running_pid() { + local service_name="$1" + local pid + + pid="$(main_pid_for_service "${service_name}")" + if [[ -z "${pid}" || "${pid}" == "0" ]]; then + blitz_log "${STEP}" "lookup-pid" "failure" "service=${service_name}" 1 + exit 1 + fi + printf '%s\n' "${pid}" +} + +write_fault_flag() { + local flag_name="$1" + local flag_path="${BLITZ_RUNTIME_DIR}/${flag_name}" + printf '%s\n' "$(date +%s)" > "${flag_path}" + blitz_log "${STEP}" "flag-on" "success" "path=${flag_path}" 0 +} + +clear_fault_flag() { + local flag_name="$1" + local 
flag_path="${BLITZ_RUNTIME_DIR}/${flag_name}" + rm -f "${flag_path}" + blitz_log "${STEP}" "flag-off" "success" "path=${flag_path}" 0 +} + +blitz_load_boot_env +blitz_require_root "${STEP}" +blitz_prepare_runtime_dir + +case "${1:-}" in + bside-crash) + kill -9 "$(require_running_pid "${B_SIDE_SERVICE}")" + ;; + bside-process-freeze) + kill -STOP "$(require_running_pid "${B_SIDE_SERVICE}")" + ;; + bside-video-thread-stall) + write_fault_flag "fault-injection-bside-video-thread-stall" + ;; + bside-control-thread-stall) + write_fault_flag "fault-injection-bside-control-thread-stall" + ;; + ros-crash) + kill -9 "$(require_running_pid "${ROS_SERVICE}")" + ;; + ros-freeze) + kill -STOP "$(require_running_pid "${ROS_SERVICE}")" + ;; + network-down) + if [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" != "1" ]]; then + blitz_log "${STEP}" "network-down" "failure" "set BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION=1 first" 1 + exit 1 + fi + case "${2:-}" in + on) + write_fault_flag "fault-injection-network-down" + ;; + off) + clear_fault_flag "fault-injection-network-down" + ;; + *) + echo "usage: $0 network-down on|off" >&2 + exit 2 + ;; + esac + ;; + *) + cat <<'EOF' +usage: + blitz-fault-inject.sh bside-crash + blitz-fault-inject.sh bside-process-freeze + blitz-fault-inject.sh bside-video-thread-stall + blitz-fault-inject.sh bside-control-thread-stall + blitz-fault-inject.sh ros-crash + blitz-fault-inject.sh ros-freeze + blitz-fault-inject.sh network-down on|off +EOF + exit 2 + ;; +esac diff --git a/scripts/boot/blitz-watchdog.sh b/scripts/boot/blitz-watchdog.sh new file mode 100644 index 0000000..da75343 --- /dev/null +++ b/scripts/boot/blitz-watchdog.sh @@ -0,0 +1,388 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="watchdog" +B_SIDE_SERVICE="blitz-b-side-omnid.service" +ROS_SERVICE="blitz-ros-receiver.service" +B_SIDE_STATUS_FILE="" +ROS_STATUS_FILE="" 
+WATCHDOG_STATUS_FILE="" +NETWORK_FAULT_FILE="" +CAMERA_MISSING_PREV=0 +CAMERA_RECOVERY_STABLE_COUNT=0 +NETWORK_FAIL_COUNT=0 +NETWORK_COOLDOWN_UNTIL=0 +BACKOFF_UNTIL=0 +LAST_ACTION="none" +LAST_ACTION_EPOCH_MS=0 +FULL_RESTART_WINDOW_START=0 +FULL_RESTART_WINDOW_COUNT=0 +NETWORK_LAST_INTERFACE="" +declare -A TARGETED_RESTART_WINDOW_START=() +declare -A TARGETED_RESTART_WINDOW_COUNT=() + +now_epoch_sec() { + date +%s +} + +now_epoch_ms() { + date +%s%3N +} + +service_is_active() { + systemctl is-active --quiet "$1" +} + +status_file_fresh() { + local path="$1" + local max_age_sec="$2" + local now_sec + local mtime_sec + + if [[ ! -f "${path}" ]]; then + return 1 + fi + now_sec="$(now_epoch_sec)" + mtime_sec="$(stat -c %Y "${path}" 2>/dev/null || echo 0)" + (( now_sec - mtime_sec <= max_age_sec )) +} + +ros_receiver_status_fresh() { + local path="$1" + local max_age_sec="$2" + local now_epoch_ms_value + + now_epoch_ms_value="$(now_epoch_ms)" + python3 - "${path}" "${now_epoch_ms_value}" "${max_age_sec}" <<'PY' +import json +import sys + +path = sys.argv[1] +now_epoch_ms = int(sys.argv[2]) +max_age_ms = int(sys.argv[3]) * 1000 + +try: + with open(path, "r", encoding="utf-8") as handle: + payload = json.load(handle) +except Exception: + raise SystemExit(1) + +heartbeat_ms = int(payload.get("recv_thread_heartbeat_epoch_ms") or 0) +socket_bound = bool(payload.get("socket_bound")) + +if heartbeat_ms <= 0 or not socket_bound: + raise SystemExit(1) + +raise SystemExit(0 if now_epoch_ms - heartbeat_ms <= max_age_ms else 1) +PY +} + +ros_receiver_healthy() { + local max_age_sec="$1" + + service_is_active "${ROS_SERVICE}" \ + && [[ -S "${ROBOT_RECEIVER_LOCAL_SOCKET_PATH}" ]] \ + && status_file_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" \ + && ros_receiver_status_fresh "${ROS_STATUS_FILE}" "${max_age_sec}" +} + +write_watchdog_status() { + local fault_reason="$1" + local recovery_state="$2" + local network_ok="$3" + local camera_ok="$4" + local ros_ok="$5" + local bside_ok="$6" 
+ local tmp_file + + tmp_file="${WATCHDOG_STATUS_FILE}.tmp.$$" + cat > "${tmp_file}" < 60 )); then + window_start="${now_sec}" + count=1 + else + count=$(( count + 1 )) + fi + TARGETED_RESTART_WINDOW_START["${fault_key}"]="${window_start}" + TARGETED_RESTART_WINDOW_COUNT["${fault_key}"]="${count}" + (( count >= 2 )) +} + +record_full_restart() { + local now_sec + + now_sec="$(now_epoch_sec)" + if (( FULL_RESTART_WINDOW_START == 0 || now_sec - FULL_RESTART_WINDOW_START > 600 )); then + FULL_RESTART_WINDOW_START="${now_sec}" + FULL_RESTART_WINDOW_COUNT=1 + else + FULL_RESTART_WINDOW_COUNT=$(( FULL_RESTART_WINDOW_COUNT + 1 )) + fi + if (( FULL_RESTART_WINDOW_COUNT >= 3 )); then + BACKOFF_UNTIL=$(( now_sec + 60 )) + fi +} + +restart_bside_targeted() { + local fault_key="$1" + local reason="$2" + + if register_targeted_restart "${fault_key}"; then + blitz_log "${STEP}" "escalate-full-restart" "start" "reason=${reason}" 0 + full_restart_stack "${reason}-escalated" + return 0 + fi + + set_last_action "restart-bside" + RECOVERY_ACTION_TAKEN=1 + blitz_log "${STEP}" "restart-bside" "start" "reason=${reason}" 0 + if systemctl restart "${B_SIDE_SERVICE}"; then + blitz_log "${STEP}" "restart-bside" "success" "reason=${reason}" 0 + else + local rc=$? # systemctl's exit code, kept function-local so it does not leak into the script's globals + blitz_log "${STEP}" "restart-bside" "failure" "reason=${reason}" "${rc}" + return "${rc}" + fi +} + +full_restart_stack() { # Ordered full restart: stop b_side, restart the ROS receiver, wait for its unix socket, start b_side. A failed step is logged, recorded via record_full_restart, and its exit code is returned. + local reason="$1" + local rc=0 + + set_last_action "full-restart" + RECOVERY_ACTION_TAKEN=1 + recovery_state="recovering" + fault_reason="${reason}" + + blitz_log "${STEP}" "full-restart-stop-bside" "start" "reason=${reason}" 0 + systemctl stop "${B_SIDE_SERVICE}" || true + + systemctl restart "${ROS_SERVICE}" || rc=$? # capture the real exit code here: "$?" read inside "if ! cmd; then" is the negated status, which is always 0 + if (( rc != 0 )); then + blitz_log "${STEP}" "full-restart-restart-ros" "failure" "reason=${reason}" "${rc}" + record_full_restart + return "${rc}" + fi + blitz_log "${STEP}" "full-restart-restart-ros" "success" "reason=${reason}" 0 + +
bash "${SCRIPT_DIR}/wait-for-unix-socket.sh" --step "${STEP}" --timeout "${BLITZ_ROS_SOCKET_WAIT_SEC}" || rc=$? # rc is still 0 at this point unless this step fails + if (( rc != 0 )); then + blitz_log "${STEP}" "full-restart-wait-socket" "failure" "reason=${reason}" "${rc}" + record_full_restart + return "${rc}" + fi + + systemctl start "${B_SIDE_SERVICE}" || rc=$? + if (( rc != 0 )); then + blitz_log "${STEP}" "full-restart-start-bside" "failure" "reason=${reason}" "${rc}" + record_full_restart + return "${rc}" + fi + blitz_log "${STEP}" "full-restart-start-bside" "success" "reason=${reason}" 0 + record_full_restart +} + +network_fault_injected() { + [[ "${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION}" == "1" && -f "${NETWORK_FAULT_FILE}" ]] +} + +resolve_network_interface() { + NETWORK_LAST_INTERFACE="$(blitz_resolve_5g_interface || true)" + [[ -n "${NETWORK_LAST_INTERFACE}" ]] +} + +network_is_healthy() { + local route_output + + NETWORK_LAST_INTERFACE="" + if network_fault_injected; then + return 1 + fi + if ! resolve_network_interface; then + return 1 + fi + route_output="$(blitz_route_ready "${BLITZ_TIME_SERVER_IP}" "${NETWORK_LAST_INTERFACE}" || true)" + if [[ -z "${route_output}" ]]; then + return 1 + fi + ping -I "${NETWORK_LAST_INTERFACE}" -c 1 -W 2 "${BLITZ_TIME_SERVER_IP}" >/dev/null 2>&1 +} + +wait_for_network_recovery() { + local timeout_sec="$1" + local waited=0 + + while (( waited < timeout_sec )); do + if network_is_healthy; then + blitz_log "${STEP}" "network-postcheck" "success" "interface=${NETWORK_LAST_INTERFACE} waited_sec=${waited}" 0 + return 0 + fi + if (( waited == 0 || waited % 5 == 0 )); then + blitz_log "${STEP}" "network-postcheck" "waiting" "interface=${NETWORK_LAST_INTERFACE:-unresolved} waited_sec=${waited}" 0 + fi + sleep 1 + waited=$(( waited + 1 )) + done + + blitz_log "${STEP}" "network-postcheck" "failure" "interface=${NETWORK_LAST_INTERFACE:-unresolved} timeout_sec=${timeout_sec}" 1 + return 1 +} + +perform_network_recovery() { + local rc=0 + + set_last_action "network-recovery" + RECOVERY_ACTION_TAKEN=1 + blitz_log
"${STEP}" "network-recovery" "start" "fail_count=${NETWORK_FAIL_COUNT}" 0 + systemctl stop "${B_SIDE_SERVICE}" || true + + bash "${SCRIPT_DIR}/5g-dial.sh" || rc=$? # capture the real exit code here: "$?" read inside "if ! cmd; then" is the negated status, which is always 0 + if (( rc != 0 )); then + blitz_log "${STEP}" "network-redial" "failure" "fail_count=${NETWORK_FAIL_COUNT}" "${rc}" + return "${rc}" + fi + + wait_for_network_recovery "${BLITZ_5G_ROUTE_WAIT_SEC}" || rc=$? # rc is still 0 at this point unless the post-dial wait fails + if (( rc != 0 )); then + blitz_log "${STEP}" "network-recovery" "failure" "fail_count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" "${rc}" + return "${rc}" + fi + + NETWORK_COOLDOWN_UNTIL=$(( $(now_epoch_sec) + BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC )) + NETWORK_FAIL_COUNT=0 + if ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then + restart_bside_targeted "network" "network-recovered" + return 0 + fi + full_restart_stack "network-recovered-ros-unhealthy" + return 0 +} + +blitz_load_boot_env +blitz_require_root "${STEP}" +blitz_require_command systemctl "${STEP}" +blitz_require_command stat "${STEP}" +blitz_require_command ping "${STEP}" +blitz_require_command python3 "${STEP}" +blitz_prepare_runtime_dir + +B_SIDE_STATUS_FILE="${BLITZ_RUNTIME_DIR}/b-side-omnid.status.json" +ROS_STATUS_FILE="${BLITZ_RUNTIME_DIR}/ros-receiver.status.json" +WATCHDOG_STATUS_FILE="${BLITZ_RUNTIME_DIR}/watchdog.status.json" +NETWORK_FAULT_FILE="${BLITZ_RUNTIME_DIR}/fault-injection-network-down" + +while true; do + fault_reason="none" + recovery_state="ok" + network_ok=1 + camera_ok=1 + ros_ok=1 + bside_ok=1 + RECOVERY_ACTION_TAKEN=0 + now_sec="$(now_epoch_sec)" + + if (( BACKOFF_UNTIL > now_sec )); then + fault_reason="backoff" + recovery_state="backoff" + write_watchdog_status "${fault_reason}" "${recovery_state}" 0 0 0 0 + sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}" + continue + fi + + if (( NETWORK_COOLDOWN_UNTIL > now_sec )); then + recovery_state="recovering" + elif !
network_is_healthy; then + network_ok=0 + NETWORK_FAIL_COUNT=$(( NETWORK_FAIL_COUNT + 1 )) + fault_reason="network_or_robot_unreachable" + recovery_state="recovering" + blitz_log "${STEP}" "network-check" "failure" "count=${NETWORK_FAIL_COUNT} interface=${NETWORK_LAST_INTERFACE:-unresolved}" 1 + if (( NETWORK_FAIL_COUNT >= BLITZ_NETWORK_FAIL_THRESHOLD )); then + perform_network_recovery || true + fi + else + NETWORK_FAIL_COUNT=0 + fi + + if [[ ! -e "${OMNI_CAMERA_DEVICE}" ]]; then + camera_ok=0 + fault_reason="camera_missing" + recovery_state="degraded" + CAMERA_MISSING_PREV=1 + CAMERA_RECOVERY_STABLE_COUNT=0 + elif (( RECOVERY_ACTION_TAKEN == 0 && CAMERA_MISSING_PREV == 1 )); then + CAMERA_RECOVERY_STABLE_COUNT=$(( CAMERA_RECOVERY_STABLE_COUNT + 1 )) + recovery_state="recovering" + fault_reason="camera_recovered" + if (( CAMERA_RECOVERY_STABLE_COUNT >= 2 )); then + restart_bside_targeted "camera" "camera-reappeared" || true + CAMERA_MISSING_PREV=0 + CAMERA_RECOVERY_STABLE_COUNT=0 + fi + else + CAMERA_RECOVERY_STABLE_COUNT=0 + fi + + if (( RECOVERY_ACTION_TAKEN == 0 )) && { ! service_is_active "${B_SIDE_SERVICE}" || ! status_file_fresh "${B_SIDE_STATUS_FILE}" "${BLITZ_HEALTH_STALE_SEC}"; }; then + bside_ok=0 + fault_reason="bside_status_stale" + recovery_state="recovering" + restart_bside_targeted "bside" "bside-unhealthy" || true + fi + + if (( RECOVERY_ACTION_TAKEN == 0 )) && ! 
ros_receiver_healthy "${BLITZ_HEALTH_STALE_SEC}"; then + ros_ok=0 + fault_reason="ros_receiver_unhealthy" + recovery_state="recovering" + full_restart_stack "ros-unhealthy" || true + fi + + write_watchdog_status "${fault_reason}" "${recovery_state}" "${network_ok}" "${camera_ok}" "${ros_ok}" "${bside_ok}" + sleep "${BLITZ_WATCHDOG_INTERVAL_SEC}" +done diff --git a/scripts/boot/common.sh b/scripts/boot/common.sh index 669c98d..bf9256a 100644 --- a/scripts/boot/common.sh +++ b/scripts/boot/common.sh @@ -52,6 +52,7 @@ blitz_load_boot_env() { export BLITZ_BOOT_DELAY_SEC="${BLITZ_BOOT_DELAY_SEC:-30}" export BLITZ_LOG_FILE="${BLITZ_LOG_FILE:-/var/log/blitz-robot/startup.log}" + export BLITZ_RUNTIME_DIR="${BLITZ_RUNTIME_DIR:-/run/blitz-robot}" export BLITZ_5G_DIAL_DIR="${BLITZ_5G_DIAL_DIR:-${BOOT_SCRIPT_DIR}}" export BLITZ_5G_SERIAL_PORT="${BLITZ_5G_SERIAL_PORT:-/dev/ttyUSB7}" export BLITZ_5G_INTERFACE="${BLITZ_5G_INTERFACE:-}" @@ -65,12 +66,14 @@ blitz_load_boot_env() { export BLITZ_5G_SERIAL_WAIT_SEC="${BLITZ_5G_SERIAL_WAIT_SEC:-60}" export BLITZ_5G_ROUTE_WAIT_SEC="${BLITZ_5G_ROUTE_WAIT_SEC:-30}" export BLITZ_TIME_SERVER_IP="${BLITZ_TIME_SERVER_IP:-${default_time_server}}" - export BLITZ_TIME_SERVER_PORT="${BLITZ_TIME_SERVER_PORT:-123}" - export BLITZ_TIME_SYNC_WAIT_SEC="${BLITZ_TIME_SYNC_WAIT_SEC:-60}" - export BLITZ_TIME_SYNC_MAX_OFFSET_SEC="${BLITZ_TIME_SYNC_MAX_OFFSET_SEC:-0.002}" - export BLITZ_TIME_SYNC_INTERVAL_SEC="${BLITZ_TIME_SYNC_INTERVAL_SEC:-1}" export BLITZ_ROS_USER="${BLITZ_ROS_USER:-nvidia}" export BLITZ_ROS_SOCKET_WAIT_SEC="${BLITZ_ROS_SOCKET_WAIT_SEC:-20}" + export BLITZ_WATCHDOG_INTERVAL_SEC="${BLITZ_WATCHDOG_INTERVAL_SEC:-5}" + export BLITZ_HEALTH_STALE_SEC="${BLITZ_HEALTH_STALE_SEC:-15}" + export BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="${BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC:-15}" + export BLITZ_NETWORK_FAIL_THRESHOLD="${BLITZ_NETWORK_FAIL_THRESHOLD:-3}" + export BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="${BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC:-30}" + 
export BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="${BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION:-0}" export BLITZ_BOOT_ENV_LOADED="1" } @@ -200,3 +203,51 @@ blitz_route_ready() { printf '%s\n' "${route_output}" return 0 } + +blitz_resolve_5g_interface() { + local explicit_interface="${BLITZ_5G_INTERFACE:-}" + local info_json="${BLITZ_5G_INFO_JSON:-}" + + if [[ -n "${explicit_interface}" ]]; then + printf '%s\n' "${explicit_interface}" + return 0 + fi + if [[ -z "${info_json}" || ! -f "${info_json}" ]]; then + return 1 + fi + + python3 - "${info_json}" <<'PY' +import json +import sys + +path = sys.argv[1] + +try: + with open(path, "r", encoding="utf-8") as handle: + payload = json.load(handle) +except Exception: + raise SystemExit(1) + +interface = str(payload.get("interface") or "").strip() +if not interface: + raise SystemExit(1) + +print(interface) +PY +} + +blitz_prepare_runtime_dir() { + local runtime_dir + + blitz_load_boot_env + runtime_dir="${BLITZ_RUNTIME_DIR}" + + mkdir -p "${runtime_dir}" + if [[ "${EUID}" -eq 0 ]]; then + chown "root:${BLITZ_ROS_USER}" "${runtime_dir}" + chmod 0775 "${runtime_dir}" + else + chmod 0775 "${runtime_dir}" 2>/dev/null || true + fi + blitz_log "runtime-dir" "prepare" "success" "path=${runtime_dir}" 0 +} diff --git a/scripts/boot/disable-systemd.sh b/scripts/boot/disable-systemd.sh new file mode 100644 index 0000000..b7da8f5 --- /dev/null +++ b/scripts/boot/disable-systemd.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="disable" +SYSTEMD_DEST_DIR="/etc/systemd/system" +UNITS=( + "blitz-watchdog.service" + "blitz-b-side-omnid.service" + "blitz-ros-receiver.service" + "blitz-5g-dial.service" + "blitz-boot-gate.service" + "blitz-robot.target" +) + +stop_unit_if_present() { + local unit_name="$1" + local unit_path="${SYSTEMD_DEST_DIR}/${unit_name}" + + if [[ ! 
-f "${unit_path}" ]]; then + return 0 + fi + blitz_run "${STEP}" "stop-unit" systemctl stop "${unit_name}" || true +} + +disable_unit_if_present() { + local unit_name="$1" + local unit_path="${SYSTEMD_DEST_DIR}/${unit_name}" + + if [[ ! -f "${unit_path}" ]]; then + return 0 + fi + blitz_run "${STEP}" "disable-unit" systemctl disable "${unit_name}" || true +} + +blitz_load_boot_env +blitz_require_root "${STEP}" +blitz_require_command systemctl "${STEP}" + +for unit_name in "${UNITS[@]}"; do + stop_unit_if_present "${unit_name}" +done + +for unit_name in "${UNITS[@]}"; do + disable_unit_if_present "${unit_name}" +done + +blitz_log "${STEP}" "complete" "success" "boot chain stopped and disabled; next reboot will not auto-start blitz services" 0 diff --git a/scripts/boot/install-systemd.sh b/scripts/boot/install-systemd.sh index 91744cf..9c91e3d 100644 --- a/scripts/boot/install-systemd.sh +++ b/scripts/boot/install-systemd.sh @@ -30,6 +30,19 @@ install_unit() { blitz_log "install" "install-unit" "success" "unit=${SYSTEMD_DEST_DIR}/${template_name%.in}" 0 } +remove_unit_if_present() { + local unit_name="$1" + local unit_path="${SYSTEMD_DEST_DIR}/${unit_name}" + + if [[ ! 
-f "${unit_path}" ]]; then + return 0 + fi + + systemctl disable --now "${unit_name}" >/dev/null 2>&1 || true + rm -f "${unit_path}" + blitz_log "install" "remove-unit" "success" "unit=${unit_path}" 0 +} + blitz_load_boot_env blitz_require_root "install" blitz_require_command install "install" @@ -40,13 +53,15 @@ install -d -m 0755 "$(dirname "${BLITZ_LOG_FILE}")" touch "${BLITZ_LOG_FILE}" chmod 0644 "${BLITZ_LOG_FILE}" blitz_log "install" "prepare-log-file" "success" "log_file=${BLITZ_LOG_FILE}" 0 +blitz_prepare_runtime_dir install_unit "blitz-boot-gate.service.in" install_unit "blitz-5g-dial.service.in" -install_unit "blitz-time-sync.service.in" install_unit "blitz-ros-receiver.service.in" install_unit "blitz-b-side-omnid.service.in" +install_unit "blitz-watchdog.service.in" install_unit "blitz-robot.target.in" +remove_unit_if_present "blitz-time-sync.service" blitz_run "install" "daemon-reload" systemctl daemon-reload blitz_run "install" "enable-target" systemctl enable blitz-robot.target diff --git a/scripts/boot/prepare-runtime-dir.sh b/scripts/boot/prepare-runtime-dir.sh new file mode 100644 index 0000000..c2b954a --- /dev/null +++ b/scripts/boot/prepare-runtime-dir.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +STEP="runtime-dir" + +blitz_load_boot_env +blitz_prepare_runtime_dir +blitz_log "${STEP}" "complete" "success" "runtime_dir=${BLITZ_RUNTIME_DIR}" 0 diff --git a/scripts/boot/robot-boot.env b/scripts/boot/robot-boot.env index 953cd75..a670f73 100644 --- a/scripts/boot/robot-boot.env +++ b/scripts/boot/robot-boot.env @@ -3,6 +3,7 @@ BLITZ_BOOT_DELAY_SEC="30" BLITZ_LOG_FILE="/var/log/blitz-robot/startup.log" +BLITZ_RUNTIME_DIR="/run/blitz-robot" BLITZ_5G_DIAL_DIR="${OMNISOCKETGO_ROOT}/scripts/boot" BLITZ_5G_SERIAL_PORT="/dev/ttyUSB2" @@ -18,13 +19,15 @@ BLITZ_5G_ROUTE_WAIT_SEC="30" # Leave empty to fall back to the 
host part of ROBOT_SIDE_OMNISOCKET_SERVER_ADDR. BLITZ_TIME_SERVER_IP="81.70.156.140" -BLITZ_TIME_SERVER_PORT="10910" -BLITZ_TIME_SYNC_WAIT_SEC="30" -BLITZ_TIME_SYNC_MAX_OFFSET_SEC="0.002" -BLITZ_TIME_SYNC_INTERVAL_SEC="1" BLITZ_ROS_USER="nvidia" BLITZ_ROS_SOCKET_WAIT_SEC="20" +BLITZ_WATCHDOG_INTERVAL_SEC="5" +BLITZ_HEALTH_STALE_SEC="15" +BLITZ_OMNID_THREAD_HEARTBEAT_TIMEOUT_SEC="15" +BLITZ_NETWORK_FAIL_THRESHOLD="3" +BLITZ_NETWORK_RECOVERY_COOLDOWN_SEC="30" +BLITZ_WATCHDOG_ALLOW_FAULT_INJECTION="0" # Boot units run b_side_omnid as root directly, so nested sudo must stay off. B_SIDE_OMNID_USE_SUDO="0" diff --git a/scripts/boot/systemd/blitz-b-side-omnid.service.in b/scripts/boot/systemd/blitz-b-side-omnid.service.in index 2269ace..4fdcc43 100644 --- a/scripts/boot/systemd/blitz-b-side-omnid.service.in +++ b/scripts/boot/systemd/blitz-b-side-omnid.service.in @@ -1,10 +1,11 @@ [Unit] Description=Blitz robot b-side omnid -After=blitz-time-sync.service blitz-ros-receiver.service -Wants=blitz-time-sync.service blitz-ros-receiver.service +After=blitz-5g-dial.service blitz-ros-receiver.service +Wants=blitz-5g-dial.service blitz-ros-receiver.service [Service] Type=simple +ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-b-side-omnid-service.sh Restart=always RestartSec=2 diff --git a/scripts/boot/systemd/blitz-robot.target.in b/scripts/boot/systemd/blitz-robot.target.in index 299220b..f2a3136 100644 --- a/scripts/boot/systemd/blitz-robot.target.in +++ b/scripts/boot/systemd/blitz-robot.target.in @@ -2,9 +2,9 @@ Description=Blitz robot boot chain Wants=blitz-boot-gate.service Wants=blitz-5g-dial.service -Wants=blitz-time-sync.service Wants=blitz-ros-receiver.service Wants=blitz-b-side-omnid.service +Wants=blitz-watchdog.service After=multi-user.target [Install] diff --git a/scripts/boot/systemd/blitz-ros-receiver.service.in b/scripts/boot/systemd/blitz-ros-receiver.service.in index 
fbafd6c..437c136 100644 --- a/scripts/boot/systemd/blitz-ros-receiver.service.in +++ b/scripts/boot/systemd/blitz-ros-receiver.service.in @@ -1,11 +1,13 @@ [Unit] Description=Blitz robot ROS receiver -After=blitz-time-sync.service -Wants=blitz-time-sync.service +After=blitz-5g-dial.service +Wants=blitz-5g-dial.service [Service] Type=simple User=@BLITZ_ROS_USER@ +PermissionsStartOnly=true +ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/start-ros-receiver-service.sh ExecStartPost=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/wait-for-unix-socket.sh --step ros-receiver Restart=always diff --git a/scripts/boot/systemd/blitz-time-sync.service.in b/scripts/boot/systemd/blitz-time-sync.service.in deleted file mode 100644 index 36fd79b..0000000 --- a/scripts/boot/systemd/blitz-time-sync.service.in +++ /dev/null @@ -1,14 +0,0 @@ -[Unit] -Description=Blitz robot private chrony sync -After=blitz-5g-dial.service -Wants=blitz-5g-dial.service - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/time-sync.sh -StandardOutput=append:@BLITZ_LOG_FILE@ -StandardError=append:@BLITZ_LOG_FILE@ - -[Install] -WantedBy=blitz-robot.target diff --git a/scripts/boot/systemd/blitz-watchdog.service.in b/scripts/boot/systemd/blitz-watchdog.service.in new file mode 100644 index 0000000..ad246a9 --- /dev/null +++ b/scripts/boot/systemd/blitz-watchdog.service.in @@ -0,0 +1,16 @@ +[Unit] +Description=Blitz robot health watchdog +After=blitz-b-side-omnid.service blitz-ros-receiver.service +Wants=blitz-b-side-omnid.service blitz-ros-receiver.service + +[Service] +Type=simple +ExecStartPre=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/prepare-runtime-dir.sh +ExecStart=/bin/bash @OMNISOCKETGO_ROOT@/scripts/boot/blitz-watchdog.sh +Restart=always +RestartSec=5 +StandardOutput=append:@BLITZ_LOG_FILE@ +StandardError=append:@BLITZ_LOG_FILE@ + +[Install] +WantedBy=blitz-robot.target 
diff --git a/scripts/boot/time-sync.sh b/scripts/boot/time-sync.sh deleted file mode 100644 index 27434ce..0000000 --- a/scripts/boot/time-sync.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck disable=SC1091 -source "${SCRIPT_DIR}/common.sh" - -STEP="time-sync" -CHRONY_SOURCES_DIR="/etc/chrony/sources.d" -CHRONY_SOURCE_FILE="${CHRONY_SOURCES_DIR}/blitz-robot.sources" -CHRONY_MAIN_CONF="/etc/chrony/chrony.conf" -CHRONY_MAIN_CONF_BAK="/etc/chrony/chrony.conf.blitz-bak" -CHRONY_BURST_SAMPLES="${CHRONY_BURST_SAMPLES:-1/2}" - -chrony_unit_name() { - if systemctl list-unit-files chrony.service --no-legend 2>/dev/null | grep -q '^chrony\.service'; then - printf '%s\n' "chrony.service" - return 0 - fi - if systemctl list-unit-files chronyd.service --no-legend 2>/dev/null | grep -q '^chronyd\.service'; then - printf '%s\n' "chronyd.service" - return 0 - fi - printf '%s\n' "chrony.service" -} - -ensure_chrony_main_conf() { - local temp_file - - blitz_require_file "${CHRONY_MAIN_CONF}" "${STEP}" - mkdir -p "${CHRONY_SOURCES_DIR}" - - if [[ ! -f "${CHRONY_MAIN_CONF_BAK}" ]]; then - cp -a "${CHRONY_MAIN_CONF}" "${CHRONY_MAIN_CONF_BAK}" - blitz_log "${STEP}" "backup-config" "success" "backup=${CHRONY_MAIN_CONF_BAK}" 0 - fi - - temp_file="$(mktemp)" - awk ' - /^[[:space:]]*#/ { print; next } - /^[[:space:]]*(pool|server)[[:space:]]+/ { - print "# blitz-managed-disabled " $0 - next - } - { print } - ' "${CHRONY_MAIN_CONF}" > "${temp_file}" - - if ! grep -Eq '^[[:space:]]*sourcedir[[:space:]]+/etc/chrony/sources\.d([[:space:]]|$)' "${temp_file}"; then - printf '\n# blitz-managed\nsourcedir /etc/chrony/sources.d\n' >> "${temp_file}" - fi - - if ! 
cmp -s "${temp_file}" "${CHRONY_MAIN_CONF}"; then - cp "${temp_file}" "${CHRONY_MAIN_CONF}" - blitz_log "${STEP}" "rewrite-main-config" "success" "commented non-Blitz pool/server entries in ${CHRONY_MAIN_CONF}" 0 - else - blitz_log "${STEP}" "rewrite-main-config" "success" "main config already matches Blitz expectations" 0 - fi - - rm -f "${temp_file}" -} - -write_chrony_source_file() { - local temp_file - - temp_file="$(mktemp)" - cat < "${temp_file}" -# blitz-managed -server ${BLITZ_TIME_SERVER_IP} port ${BLITZ_TIME_SERVER_PORT} iburst -EOF - - if [[ ! -f "${CHRONY_SOURCE_FILE}" ]] || ! cmp -s "${temp_file}" "${CHRONY_SOURCE_FILE}"; then - cp "${temp_file}" "${CHRONY_SOURCE_FILE}" - blitz_log "${STEP}" "write-source" "success" "source_file=${CHRONY_SOURCE_FILE} server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT}" 0 - else - blitz_log "${STEP}" "write-source" "success" "source_file already matches ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0 - fi - - rm -f "${temp_file}" -} - -blitz_load_boot_env -blitz_require_root "${STEP}" -blitz_require_command systemctl "${STEP}" -blitz_require_command chronyc "${STEP}" - -if [[ -z "${BLITZ_TIME_SERVER_IP}" ]]; then - blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_IP is empty and no fallback could be derived" 1 - exit 1 -fi -if ! 
[[ "${BLITZ_TIME_SERVER_PORT}" =~ ^[0-9]+$ ]] || (( BLITZ_TIME_SERVER_PORT < 1 || BLITZ_TIME_SERVER_PORT > 65535 )); then - blitz_log "${STEP}" "precheck" "failure" "BLITZ_TIME_SERVER_PORT must be an integer between 1 and 65535" 1 - exit 1 -fi - -ensure_chrony_main_conf -write_chrony_source_file - -CHRONY_UNIT="$(chrony_unit_name)" -blitz_run "${STEP}" "restart-chrony" systemctl restart "${CHRONY_UNIT}" -blitz_run "${STEP}" "burst" chronyc burst "${CHRONY_BURST_SAMPLES}" - -blitz_log "${STEP}" "waitsync" "start" "server=${BLITZ_TIME_SERVER_IP} port=${BLITZ_TIME_SERVER_PORT} wait_sec=${BLITZ_TIME_SYNC_WAIT_SEC} max_offset_sec=${BLITZ_TIME_SYNC_MAX_OFFSET_SEC} interval_sec=${BLITZ_TIME_SYNC_INTERVAL_SEC}" 0 -if chronyc waitsync "${BLITZ_TIME_SYNC_WAIT_SEC}" "${BLITZ_TIME_SYNC_MAX_OFFSET_SEC}" 1000 "${BLITZ_TIME_SYNC_INTERVAL_SEC}"; then - blitz_log "${STEP}" "waitsync" "success" "chrony synchronized to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT}" 0 -else - rc=$? - blitz_log "${STEP}" "waitsync" "soft_fail" "chrony did not synchronize to ${BLITZ_TIME_SERVER_IP}:${BLITZ_TIME_SERVER_PORT} within the configured timeout" "${rc}" -fi - -blitz_log "${STEP}" "tracking" "start" "chronyc tracking" 0 -chronyc tracking || true -blitz_log "${STEP}" "sources" "start" "chronyc sources -v" 0 -chronyc sources -v || true -blitz_log "${STEP}" "complete" "success" "time-sync step finished" 0 diff --git a/src/video_pipeline.c b/src/video_pipeline.c index 9dafbea..a35c062 100644 --- a/src/video_pipeline.c +++ b/src/video_pipeline.c @@ -180,6 +180,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const video_pipeline_set_error(stats, buffer); } +static void video_pipeline_report_progress(const video_pipeline_config_t *config) { + if (config == NULL || config->progress_callback == NULL) { + return; + } + config->progress_callback(config->progress_context); +} + void video_pipeline_config_init(video_pipeline_config_t *config) { if (config == NULL) { 
return; @@ -853,6 +860,7 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta memset(&transport_stats, 0, sizeof(transport_stats)); memset(&packet_metadata, 0, sizeof(packet_metadata)); + video_pipeline_report_progress(config); if (config->max_frames > 0 && frame_index >= config->max_frames) { break; diff --git a/src/video_pipeline_gps.c b/src/video_pipeline_gps.c index eadf31f..13933a9 100644 --- a/src/video_pipeline_gps.c +++ b/src/video_pipeline_gps.c @@ -161,6 +161,13 @@ static void video_pipeline_set_errno_error(video_pipeline_stats_t *stats, const video_pipeline_set_error(stats, buffer); } +static void video_pipeline_report_progress(const video_pipeline_config_t *config) { + if (config == NULL || config->progress_callback == NULL) { + return; + } + config->progress_callback(config->progress_context); +} + void video_pipeline_config_init(video_pipeline_config_t *config) { if (config == NULL) { return; @@ -757,6 +764,8 @@ int video_pipeline_run(const video_pipeline_config_t *config, video_pipeline_sta double send_end_ms = 0.0; int frame_number = frame_index + 1; + video_pipeline_report_progress(config); + if (config->max_frames > 0 && frame_index >= config->max_frames) { break; }