fix(runner): recover disabled non110 service from keepalive
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 56s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 09:10:32 +08:00
parent 4aaa95c448
commit 62820cabbc
4 changed files with 91 additions and 5 deletions

View File

@@ -15,6 +15,7 @@ RUNNER_DOCKER_IMAGES="${RUNNER_DOCKER_IMAGES:-gitea/act_runner:latest}"
RUNNER_REGISTRATION_PATHS="${RUNNER_REGISTRATION_PATHS:-${RUNNER_HOME}/awoooi-non110-runner/data/.runner ${RUNNER_HOME}/awoooi-non110-runner/.runner ${RUNNER_HOME}/act-runner-awoooi/.runner /home/wooo/act-runner-awoooi/.runner /home/wooo/awoooi-act-runner/.runner /home/wooo/awoooi-non110-runner/.runner /home/wooo/act-runner/.runner}"
RUNNER_SERVICE_NAMES="${RUNNER_SERVICE_NAMES:-awoooi-non110-runner.service gitea-act-runner-awoooi.service gitea-act-runner-host.service}"
RUNNER_AUTOSTART_PATH_UNIT_NAMES="${RUNNER_AUTOSTART_PATH_UNIT_NAMES:-awoooi-non110-runner-autostart.path}"
RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES="${RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES:-awoooi-non110-runner-keepalive.service}"
RUNNER_KEEPALIVE_TIMER_UNIT_NAMES="${RUNNER_KEEPALIVE_TIMER_UNIT_NAMES:-awoooi-non110-runner-keepalive.timer}"
ALLOWED_RUNNER_CONTAINER_NAMES="${ALLOWED_RUNNER_CONTAINER_NAMES:-awoooi-non110-runner stockplatform-ubuntu-runner}"
ALLOWED_LABEL_NAMES="${ALLOWED_LABEL_NAMES:-awoooi-non110-host awoooi-non110-ubuntu awoooi-host awoooi-ubuntu}"
@@ -33,6 +34,7 @@ READY_REGISTRATION_COUNT=0
READY_SERVICE_COUNT=0
READY_ACTIVE_SERVICE_COUNT=0
READY_AUTOSTART_PATH_COUNT=0
READY_KEEPALIVE_SERVICE_COUNT=0
READY_KEEPALIVE_TIMER_COUNT=0
section() {
@@ -85,12 +87,12 @@ systemd_cat() {
systemd_show() {
local unit="$1"
local out
if out="$(systemctl show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID --no-pager 2>/dev/null)" \
if out="$(systemctl show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID -p Result -p ExecMainStatus --no-pager 2>/dev/null)" \
&& ! grep -q '^LoadState=not-found$' <<<"$out"; then
printf '%s\n' "$out"
return 0
fi
if out="$(systemctl --user show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID --no-pager 2>/dev/null)" \
if out="$(systemctl --user show "$unit" -p LoadState -p ActiveState -p UnitFileState -p MainPID -p Result -p ExecMainStatus --no-pager 2>/dev/null)" \
&& ! grep -q '^LoadState=not-found$' <<<"$out"; then
printf '%s\n' "$out"
return 0
@@ -446,6 +448,32 @@ check_autostart_paths() {
}
check_keepalive_timers() {
section "runner keepalive service metadata"
local service_unit service_text service_state
for service_unit in $RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES; do
if ! service_text="$(systemd_cat "$service_unit" 2>/dev/null)"; then
printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=0\n' "$service_unit"
continue
fi
service_state="$(systemd_show "$service_unit" | tr '\n' ' ' || true)"
if grep -q 'LoadState=not-found' <<<"$service_state"; then
printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=0\n' "$service_unit"
continue
fi
printf 'RUNNER_KEEPALIVE_SERVICE unit=%s installed=1 %s\n' "$service_unit" "$service_state"
if grep -q 'ActiveState=failed' <<<"$service_state" \
|| grep -Eq 'Result=(exit-code|signal|timeout|core-dump)' <<<"$service_state"; then
blocker "runner_keepalive_service_failed:${service_unit}"
continue
fi
if grep -Eq '^[[:space:]]*ExecStart=-/usr/bin/systemctl --user reset-failed ' <<<"$service_text" \
&& grep -Eq '^[[:space:]]*ExecStart=/usr/bin/systemctl --user daemon-reload' <<<"$service_text"; then
READY_KEEPALIVE_SERVICE_COUNT=$((READY_KEEPALIVE_SERVICE_COUNT + 1))
else
blocker "runner_keepalive_service_recovery_steps_missing:${service_unit}"
fi
done
section "runner keepalive metadata"
local unit text state active enabled interval
for unit in $RUNNER_KEEPALIVE_TIMER_UNIT_NAMES; do
@@ -477,6 +505,11 @@ check_keepalive_timers() {
fi
done
if [ "$READY_REGISTRATION_COUNT" -gt 0 ] \
&& [ "$READY_SERVICE_COUNT" -gt 0 ] \
&& [ "$READY_KEEPALIVE_SERVICE_COUNT" -eq 0 ]; then
blocker "runner_keepalive_service_not_ready"
fi
if [ "$READY_REGISTRATION_COUNT" -gt 0 ] \
&& [ "$READY_SERVICE_COUNT" -gt 0 ] \
&& [ "$READY_KEEPALIVE_TIMER_COUNT" -eq 0 ]; then
@@ -543,6 +576,7 @@ main() {
printf 'READY_SERVICE_COUNT=%s\n' "$READY_SERVICE_COUNT"
printf 'READY_ACTIVE_SERVICE_COUNT=%s\n' "$READY_ACTIVE_SERVICE_COUNT"
printf 'READY_AUTOSTART_PATH_COUNT=%s\n' "$READY_AUTOSTART_PATH_COUNT"
printf 'READY_KEEPALIVE_SERVICE_COUNT=%s\n' "$READY_KEEPALIVE_SERVICE_COUNT"
printf 'READY_KEEPALIVE_TIMER_COUNT=%s\n' "$READY_KEEPALIVE_TIMER_COUNT"
printf 'WARNING_COUNT=%s\n' "${#WARNINGS[@]}"
printf 'BLOCKER_COUNT=%s\n' "${#BLOCKERS[@]}"

View File

@@ -18,7 +18,7 @@ AUTOSTART_SERVICE_NAME="${AUTOSTART_SERVICE_NAME:-awoooi-non110-runner-autostart
AUTOSTART_PATH_NAME="${AUTOSTART_PATH_NAME:-awoooi-non110-runner-autostart.path}"
KEEPALIVE_SERVICE_NAME="${KEEPALIVE_SERVICE_NAME:-awoooi-non110-runner-keepalive.service}"
KEEPALIVE_TIMER_NAME="${KEEPALIVE_TIMER_NAME:-awoooi-non110-runner-keepalive.timer}"
KEEPALIVE_INTERVAL_SECONDS="${KEEPALIVE_INTERVAL_SECONDS:-60}"
KEEPALIVE_INTERVAL_SECONDS="${KEEPALIVE_INTERVAL_SECONDS:-15}"
USER_SERVICE_DIR="${USER_SERVICE_DIR:-${RUNNER_HOME}/.config/systemd/user}"
RUNNER_LABELS="${RUNNER_LABELS:-awoooi-non110-host:host,awoooi-non110-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04}"
WRITE_CONFIG_IF_MISSING="${WRITE_CONFIG_IF_MISSING:-1}"
@@ -245,7 +245,8 @@ Type=oneshot
ExecStart=/usr/bin/test -x ${RUNNER_BINARY}
ExecStart=/usr/bin/test -s ${RUNNER_CONFIG}
ExecStart=/usr/bin/test -s ${RUNNER_REGISTRATION}
ExecStart=/usr/bin/systemctl --user reset-failed ${SERVICE_NAME}
ExecStart=/usr/bin/systemctl --user daemon-reload
ExecStart=-/usr/bin/systemctl --user reset-failed ${SERVICE_NAME}
ExecStart=/usr/bin/systemctl --user enable ${SERVICE_NAME}
ExecStart=/usr/bin/systemctl --user start ${SERVICE_NAME}
RemainAfterExit=no

View File

@@ -93,6 +93,25 @@ WantedBy=timers.target
)
def _write_keepalive_service(path: Path) -> None:
path.write_text(
"""
[Unit]
Description=Keep AWOOOI non-110 runner active while enable sentinel exists
[Service]
Type=oneshot
ExecStart=/usr/bin/systemctl --user daemon-reload
ExecStart=-/usr/bin/systemctl --user reset-failed awoooi-non110-runner.service
ExecStart=/usr/bin/systemctl --user enable awoooi-non110-runner.service
ExecStart=/usr/bin/systemctl --user start awoooi-non110-runner.service
RemainAfterExit=no
""".strip()
+ "\n",
encoding="utf-8",
)
def _run_verifier(
tmp_path: Path,
registration_path: Path,
@@ -101,6 +120,7 @@ def _run_verifier(
unit_target_matches: bool = True,
unmanaged_runner_container: bool = False,
keepalive_timer: bool = True,
keepalive_service_failed: bool = False,
) -> subprocess.CompletedProcess[str]:
fake_bin = tmp_path / "bin"
unit_dir = tmp_path / "units"
@@ -127,6 +147,10 @@ case "$cmd" in
printf 'LoadState=loaded\\nActiveState=active\\nUnitFileState=enabled\\nMainPID=0\\n'
exit 0
fi
if [[ "$unit" == *keepalive.service ]]; then
printf 'LoadState=loaded\\nActiveState={"failed" if keepalive_service_failed else "inactive"}\\nUnitFileState=static\\nMainPID=0\\nResult={"exit-code" if keepalive_service_failed else "success"}\\nExecMainStatus={"1" if keepalive_service_failed else "0"}\\n'
exit 0
fi
printf 'LoadState=loaded\\nActiveState={"active" if active_service else "inactive"}\\nUnitFileState=disabled\\nMainPID={"1234" if active_service else "0"}\\n'
exit 0
fi
@@ -208,6 +232,7 @@ exit 1
registration_path,
)
if keepalive_timer:
_write_keepalive_service(unit_dir / "awoooi-non110-runner-keepalive.service")
_write_keepalive_timer(unit_dir / "awoooi-non110-runner-keepalive.timer")
env = {
@@ -219,6 +244,7 @@ exit 1
"RUNNER_BINARY_PATHS": str(binary_path),
"RUNNER_SERVICE_NAMES": "awoooi-non110-runner.service",
"RUNNER_AUTOSTART_PATH_UNIT_NAMES": "awoooi-non110-runner-autostart.path",
"RUNNER_KEEPALIVE_SERVICE_UNIT_NAMES": "awoooi-non110-runner-keepalive.service",
"RUNNER_KEEPALIVE_TIMER_UNIT_NAMES": "awoooi-non110-runner-keepalive.timer",
"ROLLBACK_UNIT_NAMES": "awoooi-non110-runner-rollback.service",
"RUNNER_REGISTRATION_PATHS": str(registration_path),
@@ -252,6 +278,7 @@ def test_non110_readiness_blocks_without_registration_state(tmp_path: Path) -> N
assert "READY_AUTOSTART_PATH_COUNT=1" in result.stdout
assert "RUNNER_KEEPALIVE_TIMER unit=awoooi-non110-runner-keepalive.timer installed=1" in result.stdout
assert "BLOCKER runner_keepalive_timer_not_ready" not in result.stdout
assert "BLOCKER runner_keepalive_service_not_ready" not in result.stdout
def test_non110_readiness_accepts_registration_state_presence_without_reading_it(
@@ -265,12 +292,29 @@ def test_non110_readiness_accepts_registration_state_presence_without_reading_it
assert "present=1" in result.stdout
assert "content_read=false" in result.stdout
assert "registration_condition=1" in result.stdout
assert "RUNNER_KEEPALIVE_SERVICE unit=awoooi-non110-runner-keepalive.service installed=1" in result.stdout
assert "READY_KEEPALIVE_SERVICE_COUNT=1" in result.stdout
assert "RUNNER_KEEPALIVE_TIMER unit=awoooi-non110-runner-keepalive.timer installed=1" in result.stdout
assert "READY_KEEPALIVE_TIMER_COUNT=1" in result.stdout
assert "secret-token-like-content" not in result.stdout
assert "AWOOOI_NON110_RUNNER_READY=1" in result.stdout
def test_non110_readiness_blocks_failed_keepalive_service(tmp_path: Path) -> None:
    """A failed keepalive service must block readiness.

    The verifier is run with ``keepalive_service_failed=True`` against a
    registration file that exists. The run is expected to exit 1, report
    both the per-unit failure blocker and the aggregate not-ready blocker,
    mark the runner as not ready, and never echo the registration content.
    """
    runner_state = tmp_path / ".runner"
    runner_state.write_text("secret-token-like-content-not-printed\n", encoding="utf-8")

    outcome = _run_verifier(tmp_path, runner_state, keepalive_service_failed=True)
    report = outcome.stdout

    assert outcome.returncode == 1
    assert "BLOCKER runner_keepalive_service_failed:awoooi-non110-runner-keepalive.service" in report
    assert "BLOCKER runner_keepalive_service_not_ready" in report
    # The registration file's content must not leak into the report.
    assert "secret-token-like-content" not in report
    assert "AWOOOI_NON110_RUNNER_READY=0" in report
def test_non110_readiness_blocks_registered_runner_without_keepalive_timer(
tmp_path: Path,
) -> None:

View File

@@ -96,12 +96,19 @@ def test_apply_with_existing_registration_does_not_start_runner(tmp_path: Path)
unit_dir = tmp_path / "home/.config/systemd/user"
autostart = unit_dir / "awoooi-non110-runner-autostart.service"
keepalive_service = unit_dir / "awoooi-non110-runner-keepalive.service"
keepalive_timer = unit_dir / "awoooi-non110-runner-keepalive.timer"
assert "ConditionPathExists=!" in autostart.read_text(encoding="utf-8")
assert "enable --now awoooi-non110-runner-keepalive.timer" in autostart.read_text(
encoding="utf-8"
)
assert "OnUnitInactiveSec=60s" in keepalive_timer.read_text(encoding="utf-8")
keepalive_service_text = keepalive_service.read_text(encoding="utf-8")
assert "ExecStart=/usr/bin/systemctl --user daemon-reload" in keepalive_service_text
assert (
"ExecStart=-/usr/bin/systemctl --user reset-failed awoooi-non110-runner.service"
in keepalive_service_text
)
assert "OnUnitInactiveSec=15s" in keepalive_timer.read_text(encoding="utf-8")
log = (tmp_path / "systemctl.log").read_text(encoding="utf-8")
assert "enable --now awoooi-non110-runner-autostart.path" not in log