From 62820cabbcfd67f429b05c4456a344182cb55fdf Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 09:10:32 +0800 Subject: [PATCH] fix(runner): recover disabled non110 service from keepalive --- .../check-awoooi-non110-runner-readiness.sh | 38 +++++++++++++++- ...stall-awoooi-non110-runner-user-service.sh | 5 ++- ...st_check_awoooi_non110_runner_readiness.py | 44 +++++++++++++++++++ ...stall_awoooi_non110_runner_user_service.py | 9 +++- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/ops/runner/check-awoooi-non110-runner-readiness.sh b/ops/runner/check-awoooi-non110-runner-readiness.sh index 4f66bc6d..91a2c83e 100755 --- a/ops/runner/check-awoooi-non110-runner-readiness.sh +++ b/ops/runner/check-awoooi-non110-runner-readiness.sh @@ -15,6 +15,7 @@ RUNNER_DOCKER_IMAGES="${RUNNER_DOCKER_IMAGES:-gitea/act_runner:latest}" RUNNER_REGISTRATION_PATHS="${RUNNER_REGISTRATION_PATHS:-${RUNNER_HOME}/awoooi-non110-runner/data/.runner ${RUNNER_HOME}/awoooi-non110-runner/.runner ${RUNNER_HOME}/act-runner-awoooi/.runner /home/wooo/act-runner-awoooi/.runner /home/wooo/awoooi-act-runner/.runner /home/wooo/awoooi-non110-runner/.runner /home/wooo/act-runner/.runner}" RUNNER_SERVICE_NAMES="${RUNNER_SERVICE_NAMES:-awoooi-non110-runner.service gitea-act-runner-awoooi.service gitea-act-runner-host.service}" RUNNER_AUTOSTART_PATH_UNIT_NAMES="${RUNNER_AUTOSTART_PATH_UNIT_NAMES:-awoooi-non110-runner-autostart.path}" +RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES="${RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES:-awoooi-non110-runner-keepalive.service}" RUNNER_KEEPALIVE_TIMER_UNIT_NAMES="${RUNNER_KEEPALIVE_TIMER_UNIT_NAMES:-awoooi-non110-runner-keepalive.timer}" ALLOWED_RUNNER_CONTAINER_NAMES="${ALLOWED_RUNNER_CONTAINER_NAMES:-awoooi-non110-runner stockplatform-ubuntu-runner}" ALLOWED_LABEL_NAMES="${ALLOWED_LABEL_NAMES:-awoooi-non110-host awoooi-non110-ubuntu awoooi-host awoooi-ubuntu}" @@ -33,6 +34,7 @@ READY_REGISTRATION_COUNT=0 READY_SERVICE_COUNT=0 READY_ACTIVE_SERVICE_COUNT=0 READY_AUTOSTART_PATH_COUNT=0 +READY_KEEPALIVE_SERVICE_COUNT=0 READY_KEEPALIVE_TIMER_COUNT=0 section() { @@ -85,12 +87,12 @@ systemd_cat() { systemd_show() { local unit="$1" local out - if out="$(systemctl show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID --no-pager 2>/dev/null)" \ + if out="$(systemctl show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID -p Result -p ExecMainStatus --no-pager 2>/dev/null)" \ && ! grep -q '^LoadState=not-found$' <<<"$out"; then printf '%s\n' "$out" return 0 fi - if out="$(systemctl --user show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID --no-pager 2>/dev/null)" \ + if out="$(systemctl --user show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID -p Result -p ExecMainStatus --no-pager 2>/dev/null)" \ && ! grep -q '^LoadState=not-found$' <<<"$out"; then printf '%s\n' "$out" return 0 @@ -446,6 +448,32 @@ check_autostart_paths() { } check_keepalive_timers() { + section "runner keepalive service metadata" + local service_unit service_text service_state + for service_unit in $RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES; do + if ! service_text="$(systemd_cat "$service_unit" 2>/dev/null)"; then + printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=0\n' "$service_unit" + continue + fi + service_state="$(systemd_show "$service_unit" | tr '\n' ' ' || true)" + if grep -q 'LoadState=not-found' <<<"$service_state"; then + printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=0\n' "$service_unit" + continue + fi + printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=1 %s\n' "$service_unit" "$service_state" + if grep -q 'ActiveState=failed' <<<"$service_state" \ + || grep -Eq 'Result=(exit-code|signal|timeout|core-dump)' <<<"$service_state"; then + blocker "runner_keepalive_service_failed:${service_unit}" + continue + fi + if grep -Eq '^[[:space:]]*ExecStart=-/usr/bin/systemctl --user reset-failed ' <<<"$service_text" \ + && grep -Eq '^[[:space:]]*ExecStart=/usr/bin/systemctl --user daemon-reload' <<<"$service_text"; then + READY_KEEPALIVE_SERVICE_COUNT=$((READY_KEEPALIVE_SERVICE_COUNT + 1)) + else + blocker "runner_keepalive_service_recovery_steps_missing:${service_unit}" + fi + done + section "runner keepalive metadata" local unit text state active enabled interval for unit in $RUNNER_KEEPALIVE_TIMER_UNIT_NAMES; do @@ -477,6 +505,11 @@ check_keepalive_timers() { fi done + if [ "$READY_REGISTRATION_COUNT" -gt 0 ] \ + && [ "$READY_SERVICE_COUNT" -gt 0 ] \ + && [ "$READY_KEEPALIVE_SERVICE_COUNT" -eq 0 ]; then + blocker "runner_keepalive_service_not_ready" + fi if [ "$READY_REGISTRATION_COUNT" -gt 0 ] \ && [ "$READY_SERVICE_COUNT" -gt 0 ] \ && [ "$READY_KEEPALIVE_TIMER_COUNT" -eq 0 ]; then @@ -543,6 +576,7 @@ main() { printf 'READY_SERVICE_COUNT=%s\n' "$READY_SERVICE_COUNT" printf 'READY_ACTIVE_SERVICE_COUNT=%s\n' "$READY_ACTIVE_SERVICE_COUNT" printf 'READY_AUTOSTART_PATH_COUNT=%s\n' "$READY_AUTOSTART_PATH_COUNT" + printf 'READY_KEEPALIVE_SERVICE_COUNT=%s\n' "$READY_KEEPALIVE_SERVICE_COUNT" printf 'READY_KEEPALIVE_TIMER_COUNT=%s\n' "$READY_KEEPALIVE_TIMER_COUNT" printf 'WARNING_COUNT=%s\n' "${#WARNINGS[@]}" printf 'BLOCKER_COUNT=%s\n' "${#BLOCKERS[@]}" diff --git a/ops/runner/install-awoooi-non110-runner-user-service.sh b/ops/runner/install-awoooi-non110-runner-user-service.sh index 43f9b951..06599a79 100755 --- a/ops/runner/install-awoooi-non110-runner-user-service.sh +++ b/ops/runner/install-awoooi-non110-runner-user-service.sh @@ -18,7 +18,7 @@ AUTOSTART_SERVICE_NAME="${AUTOSTART_SERVICE_NAME:-awoooi-non110-runner-autostart AUTOSTART_PATH_NAME="${AUTOSTART_PATH_NAME:-awoooi-non110-runner-autostart.path}" KEEPALIVE_SERVICE_NAME="${KEEPALIVE_SERVICE_NAME:-awoooi-non110-runner-keepalive.service}" KEEPALIVE_TIMER_NAME="${KEEPALIVE_TIMER_NAME:-awoooi-non110-runner-keepalive.timer}" -KEEPALIVE_INTERVAL_SECONDS="${KEEPALIVE_INTERVAL_SECONDS:-60}" +KEEPALIVE_INTERVAL_SECONDS="${KEEPALIVE_INTERVAL_SECONDS:-15}" USER_SERVICE_DIR="${USER_SERVICE_DIR:-${RUNNER_HOME}/.config/systemd/user}" RUNNER_LABELS="${RUNNER_LABELS:-awoooi-non110-host:host,awoooi-non110-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04}" WRITE_CONFIG_IF_MISSING="${WRITE_CONFIG_IF_MISSING:-1}" @@ -245,7 +245,8 @@ Type=oneshot ExecStart=/usr/bin/test -x ${RUNNER_BINARY} ExecStart=/usr/bin/test -s ${RUNNER_CONFIG} ExecStart=/usr/bin/test -s ${RUNNER_REGISTRATION} -ExecStart=/usr/bin/systemctl --user reset-failed ${SERVICE_NAME} +ExecStart=/usr/bin/systemctl --user daemon-reload +ExecStart=-/usr/bin/systemctl --user reset-failed ${SERVICE_NAME} ExecStart=/usr/bin/systemctl --user enable ${SERVICE_NAME} ExecStart=/usr/bin/systemctl --user start ${SERVICE_NAME} RemainAfterExit=no diff --git a/ops/runner/test_check_awoooi_non110_runner_readiness.py b/ops/runner/test_check_awoooi_non110_runner_readiness.py index 8d3f4384..aec6961e 100644 --- a/ops/runner/test_check_awoooi_non110_runner_readiness.py +++ b/ops/runner/test_check_awoooi_non110_runner_readiness.py @@ -93,6 +93,25 @@ WantedBy=timers.target ) +def _write_keepalive_service(path: Path) -> None: + path.write_text( + """ +[Unit] +Description=Keep AWOOOI non-110 runner active while enable sentinel exists + +[Service] +Type=oneshot +ExecStart=/usr/bin/systemctl --user daemon-reload +ExecStart=-/usr/bin/systemctl --user reset-failed awoooi-non110-runner.service +ExecStart=/usr/bin/systemctl --user enable awoooi-non110-runner.service +ExecStart=/usr/bin/systemctl --user start awoooi-non110-runner.service +RemainAfterExit=no +""".strip() + + "\n", + encoding="utf-8", + ) + + def _run_verifier( tmp_path: Path, registration_path: Path, @@ -101,6 +120,7 @@ def _run_verifier( unit_target_matches: bool = True, unmanaged_runner_container: bool = False, keepalive_timer: bool = True, + keepalive_service_failed: bool = False, ) -> subprocess.CompletedProcess[str]: fake_bin = tmp_path / "bin" unit_dir = tmp_path / "units" @@ -127,6 +147,10 @@ case "$cmd" in printf 'LoadState=loaded\\nActiveState=active\\nUnitFileState=enabled\\nMainPID=0\\n' exit 0 fi + if [[ "$unit" == *keepalive.service ]]; then + printf 'LoadState=loaded\\nActiveState={"failed" if keepalive_service_failed else "inactive"}\\nUnitFileState=static\\nMainPID=0\\nResult={"exit-code" if keepalive_service_failed else "success"}\\nExecMainStatus={"1" if keepalive_service_failed else "0"}\\n' + exit 0 + fi printf 'LoadState=loaded\\nActiveState={"active" if active_service else "inactive"}\\nUnitFileState=disabled\\nMainPID={"1234" if active_service else "0"}\\n' exit 0 fi @@ -208,6 +232,7 @@ exit 1 registration_path, ) if keepalive_timer: + _write_keepalive_service(unit_dir / "awoooi-non110-runner-keepalive.service") _write_keepalive_timer(unit_dir / "awoooi-non110-runner-keepalive.timer") env = { @@ -219,6 +244,7 @@ exit 1 "RUNNER_BINARY_PATHS": str(binary_path), "RUNNER_SERVICE_NAMES": "awoooi-non110-runner.service", "RUNNER_AUTOSTART_PATH_UNIT_NAMES": "awoooi-non110-runner-autostart.path", + "RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES": "awoooi-non110-runner-keepalive.service", "RUNNER_KEEPALIVE_TIMER_UNIT_NAMES": "awoooi-non110-runner-keepalive.timer", "ROLLBACK_UNIT_NAMES": "awoooi-non110-runner-rollback.service", "RUNNER_REGISTRATION_PATHS": str(registration_path), @@ -252,6 +278,7 @@ def test_non110_readiness_blocks_without_registration_state(tmp_path: Path) -> N assert "READY_AUTOSTART_PATH_COUNT=1" in result.stdout assert "RUNNER_KEEPALIVE_TIMER unit=awoooi-non110-runner-keepalive.timer installed=1" in result.stdout assert "BLOCKER runner_keepalive_timer_not_ready" not in result.stdout + assert "BLOCKER runner_keepalive_service_not_ready" not in result.stdout def test_non110_readiness_accepts_registration_state_presence_without_reading_it( @@ -265,12 +292,29 @@ def test_non110_readiness_accepts_registration_state_presence_without_reading_it assert "present=1" in result.stdout assert "content_read=false" in result.stdout assert "registration_condition=1" in result.stdout + assert "RUNNER_KEEPALIVE_SERVICE unit=awoooi-non110-runner-keepalive.service installed=1" in result.stdout + assert "READY_KEEPALIVE_SERVICE_COUNT=1" in result.stdout assert "RUNNER_KEEPALIVE_TIMER unit=awoooi-non110-runner-keepalive.timer installed=1" in result.stdout assert "READY_KEEPALIVE_TIMER_COUNT=1" in result.stdout assert "secret-token-like-content" not in result.stdout assert "AWOOOI_NON110_RUNNER_READY=1" in result.stdout +def test_non110_readiness_blocks_failed_keepalive_service(tmp_path: Path) -> None: + registration_path = tmp_path / ".runner" + registration_path.write_text("secret-token-like-content-not-printed\n", encoding="utf-8") + result = _run_verifier( + tmp_path, + registration_path, + keepalive_service_failed=True, + ) + assert result.returncode == 1 + assert "BLOCKER runner_keepalive_service_failed:awoooi-non110-runner-keepalive.service" in result.stdout + assert "BLOCKER runner_keepalive_service_not_ready" in result.stdout + assert "secret-token-like-content" not in result.stdout + assert "AWOOOI_NON110_RUNNER_READY=0" in result.stdout + + def test_non110_readiness_blocks_registered_runner_without_keepalive_timer( tmp_path: Path, ) -> None: diff --git a/ops/runner/test_install_awoooi_non110_runner_user_service.py b/ops/runner/test_install_awoooi_non110_runner_user_service.py index 2438dda0..5ed55562 100644 --- a/ops/runner/test_install_awoooi_non110_runner_user_service.py +++ b/ops/runner/test_install_awoooi_non110_runner_user_service.py @@ -96,12 +96,19 @@ def test_apply_with_existing_registration_does_not_start_runner(tmp_path: Path) unit_dir = tmp_path / "home/.config/systemd/user" autostart = unit_dir / "awoooi-non110-runner-autostart.service" + keepalive_service = unit_dir / "awoooi-non110-runner-keepalive.service" keepalive_timer = unit_dir / "awoooi-non110-runner-keepalive.timer" assert "ConditionPathExists=!" in autostart.read_text(encoding="utf-8") assert "enable --now awoooi-non110-runner-keepalive.timer" in autostart.read_text( encoding="utf-8" ) - assert "OnUnitInactiveSec=60s" in keepalive_timer.read_text(encoding="utf-8") + keepalive_service_text = keepalive_service.read_text(encoding="utf-8") + assert "ExecStart=/usr/bin/systemctl --user daemon-reload" in keepalive_service_text + assert ( + "ExecStart=-/usr/bin/systemctl --user reset-failed awoooi-non110-runner.service" + in keepalive_service_text + ) + assert "OnUnitInactiveSec=15s" in keepalive_timer.read_text(encoding="utf-8") log = (tmp_path / "systemctl.log").read_text(encoding="utf-8") assert "enable --now awoooi-non110-runner-autostart.path" not in log