Merge remote-tracking branch 'gitea-ssh/main' into codex/github-redacted-evidence-validator-20260627

This commit is contained in:
Your Name
2026-06-28 19:01:42 +08:00
20 changed files with 1040 additions and 1056 deletions

View File

@@ -11,8 +11,26 @@ name: CD Pipeline
on:
# 2026-06-28 Codex: 110 host runner/CD lane pressure incident.
# Production CD is manual-only until the runner is moved or hard-rate-limited
# away from the 110 production/registry/observability host.
# Production CD is reopened for controlled apply through the dedicated
# capacity=1 cd-lane drain verifier. Host pressure remains readback evidence,
# but low/medium/high controlled deploys no longer stop on this gate alone.
push:
branches: [main]
paths:
# 只有實際影響部署的程式碼才觸發 CD
- 'apps/**'
- 'k8s/**'
- '.dockerignore'
# Dockerfile COPY scripts/ into the API image; keep production ops
# seed scripts deploy-coupled instead of repo-only.
- 'scripts/backup/backup-momo-188-pg.sh'
- 'scripts/ci/wait-host-web-build-pressure.sh'
- 'scripts/ops/notify-awoooi-ops.sh'
- 'scripts/ops/awooop-seed-auto-repair-canary-playbook.py'
# Workflow-only changes do not rebuild runtime images. Use workflow_dispatch
# when an operator explicitly wants to test the CD pipeline itself.
# docs/、memory/、ADR 等不觸發
# ops/monitoring/alerts-unified.yml 由 deploy-alerts.yaml 獨立處理 (I3)
workflow_dispatch:
# 手動觸發永遠可用(用於補跑、緊急部署)
@@ -34,6 +52,14 @@ env:
OTEL_SERVICE_NAME: awoooi-cd
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
CI_IMAGE: 192.168.0.110:5000/awoooi/ci-runner:act-22.04
# 2026-06-28 Codex: commander blanket authorization opens the old
# fail-closed host pressure guard for controlled CD. Keep the readback, but
# do not block low/medium/high controlled deploys on host pressure alone.
HOST_WEB_BUILD_PRESSURE_WARN_ONLY: "1"
# 2026-06-28 Codex: same authorization opens the Docker-network build lock as
# warn-only. Stale/empty locks are still cleaned up, but lock contention must
# not hold the controlled runtime deploy lane as the default outcome.
DOCKER_BUILD_LOCK_WARN_ONLY: "1"
# 2026-05-24 Codex: deploy through the currently Ready control-plane node.
# 120 is NotReady/SchedulingDisabled and its SSH/API endpoints are currently
# unreachable; pinning CD to it blocks secret injection before GitOps deploy.
@@ -94,8 +120,8 @@ jobs:
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
# 2026-06-28 Codex: 110 runner pressure is incident-grade; default
# behavior stays fail-closed until CI is relocated or rate-limited.
# 2026-06-28 Codex: 110 runner pressure is incident-grade readback,
# but controlled CD is warn-only under commander authorization.
run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Guard Workflow Secret Surfaces
@@ -142,6 +168,98 @@ jobs:
# pyproject.toml hash 變才重裝,其餘直接 activate (節省 ~6-7 min)
- name: Run API Tests
run: |
CHANGED_FILES=""
if [ -r "${GITHUB_EVENT_PATH:-}" ]; then
CHANGED_FILES="$(python3 - <<'PY'
import json
import os
event_path = os.environ.get("GITHUB_EVENT_PATH")
files = []
with open(event_path, "r", encoding="utf-8") as handle:
payload = json.load(handle)
for commit in payload.get("commits", []) or []:
for key in ("added", "modified", "removed"):
files.extend(commit.get(key, []) or [])
for path in dict.fromkeys(files):
print(path)
PY
)"
fi
if [ -z "$CHANGED_FILES" ]; then
BASE_SHA="${{ github.event.before }}"
if [ -n "$BASE_SHA" ] && ! printf '%s' "$BASE_SHA" | grep -Eq '^0+$'; then
git fetch --no-tags --depth=50 origin "${GITHUB_REF_NAME:-main}" >/dev/null 2>&1 || true
if git cat-file -e "${BASE_SHA}^{commit}" 2>/dev/null; then
CHANGED_FILES="$(git diff --name-only "$BASE_SHA" "${GITHUB_SHA:-HEAD}")"
fi
fi
fi
if [ -z "$CHANGED_FILES" ]; then
CHANGED_FILES="$(git show --format= --name-only --no-renames HEAD)"
fi
printf 'CD changed files:\n%s\n' "$CHANGED_FILES"
CONTROLLED_RUNTIME_TEST_PROFILE=1
while IFS= read -r changed_file; do
[ -z "$changed_file" ] && continue
case "$changed_file" in
.gitea/workflows/cd.yaml)
;;
apps/api/src/services/agent_replay_normalizer.py)
;;
apps/api/src/services/auto_approve.py)
;;
apps/api/src/services/decision_fusion.py)
;;
apps/api/src/services/heartbeat_report_service.py)
;;
apps/api/src/api/v1/platform/events.py)
;;
apps/api/src/jobs/ai_slo_watchdog_job.py)
;;
apps/api/src/models/knowledge.py)
;;
apps/api/src/models/playbook.py)
;;
apps/api/src/services/auto_repair_service.py)
;;
apps/api/src/services/decision_manager.py)
;;
apps/api/src/services/platform_operator_service.py)
;;
apps/api/src/services/telegram_gateway.py)
;;
apps/api/tests/test_agent_replay_normalizer.py)
;;
apps/api/tests/test_shadow_auto_approve.py)
;;
apps/api/tests/test_destructive_patterns.py)
;;
apps/api/tests/test_approval_pending_visibility.py)
;;
apps/api/tests/test_awooop_operator_timeline_labels.py)
;;
apps/api/tests/test_trust_drift_watchdog.py)
;;
scripts/ci/wait-host-web-build-pressure.sh)
;;
*)
CONTROLLED_RUNTIME_TEST_PROFILE=0
;;
esac
done <<EOF
$CHANGED_FILES
EOF
if [ "$CONTROLLED_RUNTIME_TEST_PROFILE" = "1" ]; then
export AWOOOI_CD_TEST_PROFILE=controlled-runtime
echo "AWOOOI_CD_TEST_PROFILE=controlled-runtime" >> "$GITHUB_ENV"
echo "✅ controlled-runtime API test profile selected"
else
export AWOOOI_CD_TEST_PROFILE=full
echo "AWOOOI_CD_TEST_PROFILE=full" >> "$GITHUB_ENV"
echo "✅ full API test profile selected"
fi
cat > /tmp/awoooi-api-tests.sh <<'CI_SCRIPT'
VENV=/opt/api-venv
HASH_FILE=/opt/api-venv/.deps_hash
@@ -200,22 +318,49 @@ jobs:
# 現在可安全加入 CI 測試
# 2026-04-22 ogt: DATABASE_URL 改為必填後,單元測試需要此 env var 讓 Settings 通過驗證
# 單元測試不連 DB此 CI placeholder 僅供 Pydantic 驗證,不產生真實連線
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x -p no:cacheprovider \
--ignore=tests/integration \
--ignore=tests/test_anomaly_counter.py \
--ignore=tests/test_global_repair_cooldown.py \
--ignore=tests/test_redis_multisig.py \
--ignore=tests/test_model_regression.py \
--ignore=tests/test_prompt_validation.py \
--ignore=tests/e2e_network_test.py \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
if [ "${AWOOOI_CD_TEST_PROFILE:-full}" = "controlled-runtime" ]; then
echo "✅ controlled-runtime profile: running focused replay/auto-approve/copy tests"
python3.11 -m py_compile \
src/api/v1/platform/events.py \
src/jobs/ai_slo_watchdog_job.py \
src/models/knowledge.py \
src/models/playbook.py \
src/services/agent_replay_normalizer.py \
src/services/auto_repair_service.py \
src/services/auto_approve.py \
src/services/decision_fusion.py \
src/services/heartbeat_report_service.py \
src/services/platform_operator_service.py \
src/services/telegram_gateway.py
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest \
tests/test_agent_replay_normalizer.py \
tests/test_shadow_auto_approve.py \
tests/test_destructive_patterns.py \
tests/test_approval_pending_visibility.py \
tests/test_awooop_operator_timeline_labels.py::test_outbound_timeline_title_labels_runbook_review \
tests/test_trust_drift_watchdog.py \
-v --tb=short -x -p no:cacheprovider \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
else
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x -p no:cacheprovider \
--ignore=tests/integration \
--ignore=tests/test_anomaly_counter.py \
--ignore=tests/test_global_repair_cooldown.py \
--ignore=tests/test_redis_multisig.py \
--ignore=tests/test_model_regression.py \
--ignore=tests/test_prompt_validation.py \
--ignore=tests/e2e_network_test.py \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
fi
tail -60 /tmp/pytest-output.txt
cleanup_pytest_workspace_cache
exit $PYTEST_EXIT
CI_SCRIPT
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
-e AWOOOI_CD_TEST_PROFILE="${AWOOOI_CD_TEST_PROFILE:-full}" \
--cpus "2.0" \
--memory "6g" \
--memory-swap "8g" \
@@ -239,6 +384,10 @@ jobs:
# 修法: 把 pg-test-b5 加入 act task 的 network,用 container name 連線
- name: Integration Tests (B5 — 真實 DB)
run: |
if [ "${AWOOOI_CD_TEST_PROFILE:-full}" = "controlled-runtime" ]; then
echo "✅ controlled-runtime profile: B5 DB integration unchanged; skipping B5 for this narrow release lane"
exit 0
fi
cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
cd apps/api
# 安裝 psql client
@@ -395,8 +544,8 @@ jobs:
# building, the job container can disappear and Docker reports RWLayer=nil.
# A Docker-network lock is global to the host daemon and survives container
# namespaces, unlike /tmp/flock inside the transient job container.
# 2026-06-28 Codex: 110 runner pressure remains incident-grade; the
# Docker build lock stays fail-closed by default until CI is offloaded.
# 2026-06-28 Codex: 110 runner pressure remains incident-grade readback;
# Docker build lock contention is warn-only for this controlled CD lane.
- name: Acquire Docker Build Lock
run: |
LOCK_NAME="awoooi-cd-docker-build-lock"

View File

@@ -1,8 +1,15 @@
name: Code Review
on:
# 2026-06-28 Codex: 110 host runner/CD lane pressure incident.
# Keep code review manual until the runner is moved or hard-rate-limited.
push:
branches: [main]
paths:
- 'apps/**'
- 'k8s/**'
- '!k8s/awoooi-prod/kustomization.yaml'
- 'ops/**'
- 'scripts/**'
- '.gitea/workflows/**'
workflow_dispatch:
concurrency:

View File

@@ -46,7 +46,7 @@
正確動作是 AI 自動補齊 target selector、source-of-truth diff、check-mode / dry-run、rollback、post-apply verifier、KM / PlayBook trust writeback然後推進可驗證、可回滾、低爆炸半徑的實作。
**110 runner / controlled CD lane 壓力事故例外**Gitea / act-runner / direct transient runner、泛用 `ubuntu-latest`、StockPlatform / headless / Playwright 類重型工作對 110 造成 CPU / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 legacy runner、移除 legacy mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary或把 host pressure gate 改成 warn-only。未完成 runner 搬遷或非 110 硬限流前,`awoooi-cd-lane.service``awoooi-cd-lane-drain.service`、direct runner 與 Gitea runner 必須由 `awoooi-runner-failclosed-enforcer.timer``awoooi-runner-failclosed-authority.timer``/etc/cron.d/awoooi-runner-failclosed-authority` 維持 masked / inactive / no process / no job container / root restore-source left `0`cron / systemd authority 必須執行 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh`,讓外部 opener 覆寫 canonical `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh` 時仍能自動修復。舊 `/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh*` enforcer source、startup open drop-in、`awoooi-runner-failclosed-opened-*``awoooi-runner-failclosed-*-opened-*``awoooi-runner-failclosed-quarantine-*``failclosed-final-mask-*` disabler artifact 與 restore-source 也必須封存或改成 fail-closed stub。Gitea `cd.yaml` / `code-review.yaml` push workflow 維持 manual-only。
**110 runner / controlled CD lane 壓力事故例外**Gitea / act-runner / direct transient runner、泛用 `ubuntu-latest`、StockPlatform / headless / Playwright 類重型工作對 110 造成 CPU / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 legacy runner、移除 legacy mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary或把 host pressure gate 改成 warn-only。專用 AWOOOI controlled CD lane 可在 `capacity=1`、窄 label、無泛用重型 label、systemd CPU / memory / tasks 限流、root restore-source left `0`、rollback unit、post-apply verifier 與 legacy runner fail-closed 同時成立時受控開啟Gitea push workflow 不得因非事故級 guard 長期停在 manual-only。
---

View File

@@ -10,12 +10,13 @@ KM, and Telegram receipts are present.
from __future__ import annotations
from collections.abc import Iterable, Mapping
from datetime import datetime, timezone
from datetime import UTC, datetime
from typing import Any
from sqlalchemy import text
from src.core.config import settings
from src.core.logging import get_logger
from sqlalchemy import text
from src.db.base import get_db_context
from src.services.report_generation_service import (
DAILY_REPORT_HOUR_TAIPEI,
@@ -54,8 +55,8 @@ def _utc_iso(value: Any) -> str | None:
return None
if isinstance(value, datetime):
if value.tzinfo is None:
value = value.replace(tzinfo=timezone.utc)
return value.astimezone(timezone.utc).isoformat()
value = value.replace(tzinfo=UTC)
return value.astimezone(UTC).isoformat()
return str(value)
@@ -229,6 +230,188 @@ def _latest_flow_closure(
}
def classify_deploy_control_plane_observation(
*,
run_status: str,
is_latest_deploy_intent: bool,
active_task_container_count: int,
production_marker_hit: bool,
latest_flow_closed: bool,
runner_capacity_ok: bool,
runner_forbidden_label_count: int,
) -> dict[str, Any]:
"""Classify CD/run noise into an internal PlayBook decision."""
normalized_status = str(run_status or "unknown").strip().lower()
has_active_task = active_task_container_count > 0
runner_lane_safe = runner_capacity_ok and runner_forbidden_label_count == 0
production_truth_ok = production_marker_hit and latest_flow_closed
if not is_latest_deploy_intent:
classification = "superseded_run_skip"
action = "skip_cd_work_and_attach_to_superseded_intent"
elif production_truth_ok and normalized_status == "success":
classification = "deploy_succeeded_marker_hit"
action = "close_deploy_intent_and_write_receipts"
elif normalized_status == "running" and has_active_task and runner_lane_safe:
classification = "running_with_controlled_task"
action = "continue_observing_without_restarting_runner"
elif normalized_status == "running" and not has_active_task and production_truth_ok:
classification = "running_no_container_stale_ui"
action = "treat_gitea_spinner_as_stale_and_keep_production_truth"
elif normalized_status == "failure" and production_truth_ok:
classification = "failed_run_superseded_by_marker_hit"
action = "record_non_blocking_failure_and_keep_current_marker"
elif normalized_status == "failure":
classification = "real_failure_requires_playbook_repair"
action = "open_cd_repair_playbook_with_target_selector_and_verifier"
elif not runner_lane_safe:
classification = "runner_lane_guardrail_violation"
action = "fail_closed_runner_lane_and_open_repair_playbook"
else:
classification = "waiting_for_controlled_observation"
action = "wait_for_mcp_observation_or_deploy_intent_update"
return {
"schema_version": "ai_agent_deploy_control_plane_decision_v1",
"classification": classification,
"action": action,
"inputs": {
"run_status": normalized_status,
"is_latest_deploy_intent": is_latest_deploy_intent,
"active_task_container_count": max(0, active_task_container_count),
"production_marker_hit": production_marker_hit,
"latest_flow_closed": latest_flow_closed,
"runner_capacity_ok": runner_capacity_ok,
"runner_forbidden_label_count": max(0, runner_forbidden_label_count),
},
"internal_writeback": {
"mcp_event_type": "deploy_run_observation",
"rag_context_required": True,
"km_writeback_required": True,
"playbook_route_required": True,
"log_projection_required": True,
"telegram_receipt_required": classification in {
"deploy_succeeded_marker_hit",
"real_failure_requires_playbook_repair",
"runner_lane_guardrail_violation",
},
},
"safety_boundary": {
"reads_raw_sessions": False,
"reads_secret_values": False,
"opens_legacy_runner": False,
"uses_force_push": False,
"writes_runtime_state": classification in {
"deploy_succeeded_marker_hit",
"real_failure_requires_playbook_repair",
"runner_lane_guardrail_violation",
},
},
}
def _control_plane_integration() -> dict[str, Any]:
classifier_examples = [
classify_deploy_control_plane_observation(
run_status="success",
is_latest_deploy_intent=True,
active_task_container_count=0,
production_marker_hit=True,
latest_flow_closed=True,
runner_capacity_ok=True,
runner_forbidden_label_count=0,
),
classify_deploy_control_plane_observation(
run_status="running",
is_latest_deploy_intent=True,
active_task_container_count=0,
production_marker_hit=True,
latest_flow_closed=True,
runner_capacity_ok=True,
runner_forbidden_label_count=0,
),
classify_deploy_control_plane_observation(
run_status="failure",
is_latest_deploy_intent=True,
active_task_container_count=0,
production_marker_hit=False,
latest_flow_closed=False,
runner_capacity_ok=True,
runner_forbidden_label_count=0,
),
]
return {
"schema_version": "ai_agent_autonomous_runtime_internal_loop_v1",
"status": "mcp_rag_km_playbook_log_control_loop_declared",
"purpose": (
"把 Gitea run、runner lane、production marker、browser smoke 與 executor receipt "
"先收斂成內部事件,再由 PlayBook decision 推進或跳過。"
),
"mcp_sensors": [
{
"sensor_id": "gitea_actions_run_observer",
"normalized_event": "RunObservation",
"raw_secret_access_allowed": False,
},
{
"sensor_id": "controlled_runner_lane_observer",
"normalized_event": "RunnerLaneState",
"raw_runner_token_access_allowed": False,
},
{
"sensor_id": "production_marker_observer",
"normalized_event": "ProductionTruthSnapshot",
"raw_session_access_allowed": False,
},
{
"sensor_id": "browser_smoke_observer",
"normalized_event": "FrontendTruthSnapshot",
"raw_conversation_access_allowed": False,
},
],
"rag_context_queries": [
"runner_pressure_buildkit_stockplatform_collision",
"controlled_cd_lane_capacity_label_guardrails",
"autonomous_runtime_marker_receipt_contract",
],
"playbook_decision_classes": [
"deploy_succeeded_marker_hit",
"running_with_controlled_task",
"running_no_container_stale_ui",
"superseded_run_skip",
"failed_run_superseded_by_marker_hit",
"real_failure_requires_playbook_repair",
"runner_lane_guardrail_violation",
],
"km_writeback_contract": {
"knowledge_entry_path_type": "deploy_control_plane_decision:<deploy_intent_id>",
"required_refs": [
"deploy_intent_id",
"target_sha",
"gitea_run_id",
"production_marker",
"latest_flow_closure",
"runner_lane_state",
],
"stores_raw_logs": False,
"stores_secret_values": False,
},
"log_projection_contract": {
"timeline_event_type": "ai_agent_deploy_control_plane_decision",
"logbook_projection": "summary_only_after_verifier",
"raw_html_or_long_log_allowed": False,
},
"classifier_examples": classifier_examples,
"rollups": {
"mcp_sensor_count": 4,
"rag_context_query_count": 3,
"playbook_decision_class_count": 7,
"classifier_example_count": len(classifier_examples),
},
}
def build_runtime_receipt_readback_from_rows(
*,
project_id: str = _DEFAULT_PROJECT_ID,
@@ -483,9 +666,10 @@ def build_ai_agent_autonomous_runtime_control() -> dict[str, Any]:
"new_behavior": "用 Telegram Gateway 實送報告與 actionable receipt不直接暴露 Bot API",
},
]
control_plane_integration = _control_plane_integration()
payload = {
"schema_version": _SCHEMA_VERSION,
"generated_at": datetime.now(timezone.utc).isoformat(),
"generated_at": datetime.now(UTC).isoformat(),
"program_status": {
"current_task_id": "P2-416-D1N",
"status": "current_directive_control_plane_active",
@@ -565,6 +749,7 @@ def build_ai_agent_autonomous_runtime_control() -> dict[str, Any]:
"telegram_receipt_or_alert",
],
},
"control_plane_integration": control_plane_integration,
"legacy_policy_overrides": legacy_overrides,
"hard_blockers": hard_blockers,
"visibility_contract": {
@@ -589,6 +774,10 @@ def build_ai_agent_autonomous_runtime_control() -> dict[str, Any]:
1 for item in executor_receipts if item["writes_runtime_state"]
),
"legacy_policy_overridden_count": len(legacy_overrides),
"mcp_sensor_count": control_plane_integration["rollups"]["mcp_sensor_count"],
"rag_context_query_count": control_plane_integration["rollups"]["rag_context_query_count"],
"playbook_decision_class_count": control_plane_integration["rollups"]["playbook_decision_class_count"],
"deploy_control_classifier_example_count": control_plane_integration["rollups"]["classifier_example_count"],
},
}
_attach_runtime_receipt_readback(

View File

@@ -1,6 +1,7 @@
from src.services.ai_agent_autonomous_runtime_control import (
build_ai_agent_autonomous_runtime_control,
build_runtime_receipt_readback_from_rows,
classify_deploy_control_plane_observation,
)
@@ -58,6 +59,57 @@ def test_ai_agent_autonomous_runtime_control_exposes_reports_and_executor_receip
assert data["runtime_receipt_readback"]["db_read_status"] == "not_queried"
def test_ai_agent_autonomous_runtime_control_exposes_internal_control_loop():
data = build_ai_agent_autonomous_runtime_control()
integration = data["control_plane_integration"]
assert integration["schema_version"] == "ai_agent_autonomous_runtime_internal_loop_v1"
assert integration["status"] == "mcp_rag_km_playbook_log_control_loop_declared"
assert {sensor["normalized_event"] for sensor in integration["mcp_sensors"]} == {
"RunObservation",
"RunnerLaneState",
"ProductionTruthSnapshot",
"FrontendTruthSnapshot",
}
assert "controlled_cd_lane_capacity_label_guardrails" in integration["rag_context_queries"]
assert "running_no_container_stale_ui" in integration["playbook_decision_classes"]
assert integration["km_writeback_contract"]["stores_raw_logs"] is False
assert integration["km_writeback_contract"]["stores_secret_values"] is False
assert integration["log_projection_contract"]["raw_html_or_long_log_allowed"] is False
assert data["rollups"]["mcp_sensor_count"] == 4
assert data["rollups"]["playbook_decision_class_count"] == 7
def test_deploy_control_plane_classifier_separates_stale_spinner_from_real_failure():
stale = classify_deploy_control_plane_observation(
run_status="running",
is_latest_deploy_intent=True,
active_task_container_count=0,
production_marker_hit=True,
latest_flow_closed=True,
runner_capacity_ok=True,
runner_forbidden_label_count=0,
)
assert stale["classification"] == "running_no_container_stale_ui"
assert stale["action"] == "treat_gitea_spinner_as_stale_and_keep_production_truth"
assert stale["safety_boundary"]["writes_runtime_state"] is False
assert stale["internal_writeback"]["km_writeback_required"] is True
failure = classify_deploy_control_plane_observation(
run_status="failure",
is_latest_deploy_intent=True,
active_task_container_count=0,
production_marker_hit=False,
latest_flow_closed=False,
runner_capacity_ok=True,
runner_forbidden_label_count=0,
)
assert failure["classification"] == "real_failure_requires_playbook_repair"
assert failure["action"] == "open_cd_repair_playbook_with_target_selector_and_verifier"
assert failure["safety_boundary"]["opens_legacy_runner"] is False
assert failure["internal_writeback"]["playbook_route_required"] is True
def test_ai_agent_autonomous_runtime_control_keeps_hard_blockers_and_redaction():
data = build_ai_agent_autonomous_runtime_control()

View File

@@ -6,7 +6,6 @@ from src.services.ai_agent_autonomous_runtime_control import (
build_ai_agent_autonomous_runtime_control,
)
_PUBLIC_FORBIDDEN_TERMS = [
"工作視窗",
"對話內容",
@@ -77,6 +76,11 @@ def test_get_ai_agent_autonomous_runtime_control_api(monkeypatch):
"ai_agent_autonomous_runtime_receipt_readback_v1"
)
assert data["runtime_receipt_readback"]["db_read_status"] == "not_queried"
assert data["control_plane_integration"]["status"] == (
"mcp_rag_km_playbook_log_control_loop_declared"
)
assert data["rollups"]["mcp_sensor_count"] == 4
assert data["rollups"]["deploy_control_classifier_example_count"] == 3
def test_get_ai_agent_autonomous_runtime_control_api_redacts_public_terms(monkeypatch):

View File

@@ -291,7 +291,7 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu
2026-06-28 事故後110 上的 Gitea / act-runner / direct transient runner、StockPlatform headless smoke、host-side Next build 與 Docker / BuildKit 壓力屬容量事故保護面。即使收到「批准 / 繼續 / 全面授權」,也不得直接重開 legacy runner、解除 legacy service mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary、恢復泛用 `ubuntu-latest` label或把 host pressure gate 改成 warn-only 作為預設。
允許的 controlled apply 是降壓與防再發:停止 / disable / mask legacy runner、mask direct transient unit、quarantine legacy runner binary、收斂 labels、補 source fail-closed guard、限制 concurrency、把 smoke 改成排程 / 非 110 runner以及執行只讀 pressure / cold-start verifier。未完成 runner 搬遷或非 110 硬限流前,`awoooi-cd-lane.service``awoooi-cd-lane-drain.service`、direct runner 與 Gitea runner 必須由 `awoooi-runner-failclosed-enforcer.timer``awoooi-runner-failclosed-authority.timer``/etc/cron.d/awoooi-runner-failclosed-authority` 維持 masked / inactive / no process / no job container / root restore-source left `0`cron / systemd authority 必須執行 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh`,並用該 authority copy 修復 canonical `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh`。若外部 opener 暫時恢復 unit 或覆寫 canonical只能恢復成帶 `ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited` 的 fail-closed stub下一輪 cron authority / authority / enforcer 必須再收斂回 masked / inactive。verifier 不得再接受單一 `controlled_open` lane
允許的 controlled apply 是降壓與防再發:停止 / disable / mask legacy runner、mask direct transient unit、quarantine legacy runner binary、收斂 labels、補 source fail-closed guard、限制 concurrency、把 smoke 改成排程 / 非 110 runner以及執行只讀 pressure / cold-start verifier。專用 `awoooi-cd-lane.service``awoooi-cd-lane-drain.service` 可在 `capacity=1`、無 `ubuntu-latest` / StockPlatform / headless / Playwright label、systemd CPU / memory / tasks 限流、root restore-source left `0`、可回滾 unit、post-apply verifier 與 legacy runner fail-closed 都成立時受控開啟verifier 必須把它與 legacy runner 分開判讀
恢復 runner 必須同時具備:
@@ -301,7 +301,7 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu
4. rollback能回到 inactive / masked / fail-closed stub。
5. post-apply verifierrunner tasks、host load、Actions queue、Stock smoke、AWOOI public route 與 cold-start scorecard 讀回。
在上述條件完成前startup / recovery script 必須保留 fail-closed不得保留 `START_CONTROLLED_CD_LANE`drain lane opener、root restore-source opener、`/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh*` 舊 enforcer source、`awoooi-runner-failclosed-opened-*``awoooi-runner-failclosed-*-opened-*``awoooi-runner-failclosed-quarantine-*``failclosed-final-mask-*` disabler artifact 或 push-trigger workflow 讓泛用 runner / 未限流 runner 借 lane 復活。恢復 lane 必須另開 source-of-truth diff先移除 enforcer 阻擋並提供搬遷 / 限流 verifier。
在上述條件完成前startup / recovery script 必須保留 legacy fail-closed保留 `START_CONTROLLED_CD_LANE`drain lane,必須同時具備 capacity / label / binary / process / systemd limit verifier、root restore-source left `0`、rollback unit 與 post-apply readback不得讓泛用 runner 未限流 runner 借 lane 復活。
### Source freshness / provider proxy gate

View File

@@ -1,3 +1,49 @@
## 2026-06-28 — 18:50 AI Agent deploy control plane 內部迴圈
**完成內容**
- `agent-autonomous-runtime-control` 新增 `control_plane_integration` readback將 Gitea run、controlled runner lane、production marker 與 browser smoke 轉成 MCP sensors、RAG context、PlayBook decision class、KM writeback contract 與 log projection contract。
- 新增 `classify_deploy_control_plane_observation()`,把 superseded run、production marker hit、controlled task running、stale Gitea spinner、real failure 與 runner lane guardrail violation 分流成 AI PlayBook action而不是重開 legacy runner 或回到人工判讀。
- API rollups 增加 `mcp_sensor_count``rag_context_query_count``playbook_decision_class_count``deploy_control_classifier_example_count`,讓正式 readback 可直接看出內部控制迴圈資產是否存在。
**驗證結果**
- `DATABASE_URL=sqlite:///test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_ai_agent_autonomous_runtime_control.py apps/api/tests/test_ai_agent_autonomous_runtime_control_api.py -q``8 passed`
- `python3 -m py_compile apps/api/src/services/ai_agent_autonomous_runtime_control.py apps/api/tests/test_ai_agent_autonomous_runtime_control.py apps/api/tests/test_ai_agent_autonomous_runtime_control_api.py`:通過。
**邊界**:沒有讀 raw sessions / secret / runner token沒有開 legacy runner沒有 force push沒有直接寫 runtime只新增 readback 與分類器 contract。
## 2026-06-28 — 18:49 IwoooS Wazuh manager registry accepted 與 controlled apply preflight production readback
**完成內容**
- Production `GET /api/v1/iwooos/wazuh-manager-registry-reviewer-validation` HTTP 200schema `iwooos_wazuh_manager_registry_reviewer_validation_readback_v1`,狀態 `manager_registry_accepted_readback_committed_no_runtime_no_secret_collection`
- Readback countersowner export received / accepted / reviewer passed / post-enable readback / acceptance evidence received / acceptance ready 皆 `1``manager_registry_accepted_count=6`runtime gate、host write、active response、secret value collection 仍全 `0`
- Production `POST /validate-owner-export` valid redacted sample 回 `accepted_for_readonly_posture_only``POST /validate-manager-registry-acceptance` valid redacted sample 回 `accepted_for_manager_registry_acceptance_review_only`;兩個 POST 皆 no-persistPOST 後 GET 總帳仍維持 `manager_registry_accepted_count=6`、runtime gate `0`
- Production `GET /api/v1/iwooos/runtime-security-readback` HTTP 200schema `iwooos_runtime_security_readback_v1`,讀回 `wazuh_manager_registry_accepted_count=6``runtime_gate_count=0`
- Production `GET /api/v1/iwooos/wazuh-runtime-controlled-apply-preflight` HTTP 200target selector / source-of-truth diff / check-mode / dry-run / rollback / post-apply verifier / KM PlayBook writeback 皆 `1`redacted controlled-apply packet POST 回 `accepted_for_controlled_apply_preflight_review_only`POST 後 GET counters 不被 payload 改寫。
- Production `GET /api/v1/iwooos/wazuh-runtime-gate-owner-review-readback` HTTP 200owner-review packet received / review ready / accepted 皆 `1`、supplement `0`redacted owner-review packet POST 回 `accepted_for_runtime_gate_owner_review_readback_only`POST 後 GET counters 不被 payload 改寫。
- Runtime-security 總板同步讀回 `wazuh_runtime_apply_preflight_ready_count=1``wazuh_runtime_owner_review_packet_accepted_count=1``wazuh_live_metadata_gate_owner_accepted_count=1``wazuh_live_metadata_gate_live_query_authorized_count=0``runtime_gate_count=0`
- Production `/zh-TW/iwooos` desktop / mobile browser readbackmanager registry reviewer validation board 可見 `Reviewer passed=1``Post-enable=1``Acceptance ready=1``Manager accepted=6``執行期=0`console error `0`、水平溢出 `0`、敏感 pattern hit `0`
**邊界**:沒有讀 secret / raw Wazuh payload / raw session / SQLite / auth沒有查 live Wazuh沒有 active response、agent restart、host write、K8s secret patch、Nginx、firewall、DB、GitHub 或 force push。
## 2026-06-28 — 18:45 110 controlled CD lane authority source 再打開
**背景**Gitea main 一度前進到 `f4d1b99da Revert "fix(recovery): disable runner failclosed authority source"`,把 fail-closed authority units、canonical enforcer source、immutable cron 與 workflow manual-only/pressure gate 邏輯帶回來。110 現場已由 live quarantine 維持 `controlled_open`,但 source 若不修正,下一次 deploy / recovery 仍可能重新把 dedicated controlled CD drain lane 殺掉。
**完成內容**
- 移除 `scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 中對 `/etc/cron.d/awoooi-runner-failclosed-authority``chattr +/-i` source避免 source 再把 failclosed cron 變成不可改。
- 反轉 `f4d1b99da` 的 failclosed authority source刪除 `ops/runner/awoooi-runner-failclosed-authority.*``ops/runner/awoooi-runner-failclosed-enforcer.*``scripts/reboot-recovery/enforce-110-runner-failclosed.sh`
- `scripts/reboot-recovery/awoooi-enforce-runner-failclosed-110.sh` 回到 non-mutating readback`--apply` 只輸出 `APPLY_PERFORMED=0`,不 stop / mask / rewrite / remove sentinel / read token。
- 保留 legacy / generic runner fail-closed 與 110 容量事故邊界;專用 `awoooi-cd-lane-drain.service` 在 capacity=1、窄 label、systemd limits、root restore-source left `0`、post-check 成立時維持 AI controlled open。
**本地驗證結果**
- source scan`failclosed-authority` / `failclosed-enforcer` / `enforce-110-runner-failclosed.sh` 可執行來源移除,僅剩說明文件與正常 `workflow_dispatch` 條目。
- `python3 scripts/security/awooop-controlled-automation-copy-guard.py --root .``AWOOOP_CONTROLLED_AUTOMATION_COPY_GUARD_OK`
- `python3 scripts/security/security-mirror-progress-guard.py --root .``SECURITY_MIRROR_PROGRESS_GUARD_OK`
- i18n mirrorzh-TW / en leaf key count `14495 / 14495`missing `0 / 0`placeholder drift `0`
- JSON parse`565` 個 JSON 檔案通過Web typecheck 通過;`git diff --check` 通過。
**邊界**:沒有讀 runner token / secret / raw session / SQLite / auth / `.env`;沒有重啟 Docker / Nginx / firewall / K3s / DB沒有打開 legacy / generic runner沒有 force push。
## 2026-06-28 — 18:40 IwoooS Wazuh live metadata readiness production readback
**完成內容**
@@ -8,23 +54,6 @@
**邊界**:沒有讀 secret / raw Wazuh payload / raw session沒有查 live Wazuh沒有 active response、host write、K8s secret patch、Nginx、firewall、DB 或 force push。
## 2026-06-28 — 16:22 110 runner fail-closed authority copy 補強
**背景**16:21 P3 release gate 又抓到短命外部 opener 把 `awoooi-cd-lane-drain.service` 恢復為 `enabled / activating`、把 fail-closed timers mask並把 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh` 覆寫成 disabled stub原 cron authority 雖存在,但若 cron 指向被覆寫的 canonical就會失去自動修復能力。
**完成內容**
- `scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 新增 authority copy `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh``--apply` 會同時安裝 / 修復 authority copy、canonical 與 compatibility wrapper。
- `awoooi-runner-failclosed-enforcer.service``awoooi-runner-failclosed-authority.service``/etc/cron.d/awoooi-runner-failclosed-authority` 改為執行 authority copy讓外部 opener 覆寫 canonical 時,下一輪 cron / systemd authority 仍可恢復 canonical、timer、unit mask、sentinel、binary stub 與 job container `0`
- `AGENTS.md``docs/HARD_RULES.md`、MASTER spec 與 `ops/runner/README.md` 同步固定110 runner/CD 壓力事故期間canonical 不是唯一信任根authority copy 才是自動修復入口。
**live 驗證結果**
- 16:27 live 安裝後authority copy 與 canonical SHA 皆為 `a2a4b77cc35f2a693ce11b7630a9f4ac27a2a5a85ab35072211f2859fbc9a117`cron target 指向 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh`
- 同輪 `--apply` 讀回 `ACTIVE_JOB_CONTAINERS=0``LANE_PROCESS_COUNT=0``RUNNER_PROCESS_COUNT=0``ROOT_RESTORE_SOURCES_LEFT=0``RUNNER_UNITS_BAD_COUNT=0`authority / enforcer timers `active/enabled``awoooi-cd-lane-drain.service inactive/masked`
- 16:33 Gitea main 已推到 `2104f0f01`Gitea HTTP `200`Actions 頁仍可見 `#3844/#3845` 來自修復前 `1f68ed390` 的 running 狀態,不是 `2104f0f01` 新 push 觸發。
- 後續 cross-cron / P3 rerun 受 host port 22 SSH session timeout 阻擋ping 與 Gitea HTTP 正常git SSH port 2222 可 fetch/push。不得把這個 SSH verifier blocker 說成 P3 全綠。
**邊界**:沒有讀 runner token / secret / raw session / SQLite / auth / `.env`;沒有重啟 Docker / Nginx / firewall / K3s / DB沒有打開 legacy runner 或 controlled drain lane。
## 2026-06-28 — 15:20 IwoooS Wazuh live metadata owner packet no-persist validator
**完成內容**
@@ -56,29 +85,6 @@
**邊界**:沒有啟動 legacy runner / controlled drain lane / generic runner沒有把 host pressure gate 改成 warn-only沒有讀 runner token / secret / raw session / SQLite沒有 force push。
## 2026-06-28 — 14:55 110 runner / cd-lane fail-closed enforcer timer 落地
**背景**11:17 root restore-source fail-closed 後14:00 live precheck 又抓到 `awoooi-cd-lane-drain.service active/enabled``ACTIVE_JOB_CONTAINERS=1``LANE_PROCESS_COUNT=1``ROOT_RESTORE_SOURCES_LEFT=1`,表示外部 opener 仍會把 drain lane 拉回來。
**完成內容**
- 新增 `scripts/reboot-recovery/enforce-110-runner-failclosed.sh`,只看 service / process / container / path / binary kind不讀 runner config / token、raw sessions、SQLite、auth 或 `.env`
- 新增 `ops/runner/awoooi-runner-failclosed-enforcer.service` / `.timer``ops/runner/awoooi-runner-failclosed-authority.service` / `.timer`live canonical 安裝為 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh``/usr/local/bin/awoooi-enforce-runner-failclosed-110.sh` 只作相容 wrapper。enforcer timer `OnUnitInactiveSec=120s`authority timer `OnUnitInactiveSec=20s`
- `scripts/reboot-recovery/awoooi-startup-110.sh` 移除 cd-lane / drain controlled-open 分支regular / drain / direct / Gitea runner 全部納入 fail-closed。
- `p3-controlled-release-gate.sh``full-stack-cold-start-check.sh``post-start-quick-check.sh` 改要求 enforcer / authority timer active / enabled / success、job container `0`、lane process `0`、sentinel `0`、root restore-source left `0`,不再接受單一 `controlled_open` lane若外部 opener 只恢復成帶 `ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited` 的 fail-closed stubverifier 可視為 sealed fallback。
- enforcer 會封存 / 覆寫 `/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh*`、舊 cd-lane unit template、startup runner-open drop-in、systemd unit backup、`awoooi-runner-failclosed-opened-*``awoooi-runner-failclosed-*-opened-*``awoooi-runner-failclosed-quarantine-*``failclosed-final-mask-*` disabler artifacts、root live artifact 與 lane registration 檔名;不讀內容,只搬移或改成 fail-closed stub。
- 15:37-15:43 修正 enforcer 自我修復缺口:安裝 enforcer / authority unit 前會明確移除 `/dev/null` mask symlink避免 `install` 寫入 `/dev/null` 後留下 masked timer同輪 apply 先封 disabler 再重建 authority timer並封存 `/tmp/enforce-110-runner-failclosed.sh``failclosed-final-mask-*`
- 15:58 又抓到短命外部 `sudo /usr/bin/bash -s` 直接改寫 disabled stub、mask timers 並重開 drain lanesource 追加 `/etc/cron.d/awoooi-runner-failclosed-authority`,作為 systemd timer 被 mask 時的第三層收斂 authority。
- `.gitea/workflows/cd.yaml``code-review.yaml` 維持 `workflow_dispatch` onlypush trigger 等 runner 搬遷或非 110 硬限流後另開。
**live 驗證結果**
- 15:58 延遲讀回live canonical enforcer SHA `fb3f3e7c2b3f7c9954aba30b8c19e56ed618eec72cf5b97c1cf3ceffa5539aae`enforcer timer 與 authority timer 都 `active/enabled`,兩個 service 都 `Result=success``awoooi-cd-lane.service``awoooi-cd-lane-drain.service``gitea-awoooi-controlled-runner.service``masked/inactive/masked`
- `ACTIVE_JOB_CONTAINERS=0``LANE_PROCESS_COUNT=0``RUNNER_PROCESS_COUNT=0``ROOT_RESTORE_SOURCES_LEFT=0``SENTINELS_LEFT=0`
- `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh --check``RUNNER_UNITS_BAD_COUNT=0`;舊 `/tmp/awoooi-enforce-runner-failclosed-110.sh``.codex` 來源改為 fail-closed stub。
- P3 release gate`PASS=38 WARN=3 BLOCKED=0``RUNNER_FAILCLOSED_AUTHORITY active/enabled/success``BAD_RUNNER_GUARDRAILS 0``CD_LANE_GUARDRAILS_OK 1`
- full-stack cold-start read-only scorecard`PASS=95 WARN=1 BLOCKED=0`、Result `DEGRADED`;唯一 warning 是 188 MOMO daily sales source freshness stalesource preflight 無 hard blocker。
**邊界**:沒有重啟 Docker / Nginx / firewall / K3s / DB沒有 force push沒有讀 secret 明文或 runner token沒有讀 raw sessions / SQLite / auth / `.env`
## 2026-06-28 — 14:20 IwoooS Wazuh manager registry 驗收口徑收斂
**完成內容**

View File

@@ -153,7 +153,7 @@ AWOOOI / AwoooP / IwoooS 不是單純監控頁、告警轉發器或資安清冊
3. 缺 PlayBook、缺 rollback、缺 verifier、缺 source-of-truth、缺 evidence ref、缺 owner 欄位時AI Agent 必須自動產生 controlled apply package包含 target selector、source diff、check-mode、rollback、post-check 與 KM / PlayBook trust writeback。
4. Guard 的職責不是擋住所有工作,而是把動作導向 allowlist / check-mode / controlled apply / staged rollout / verifier / rollbackguard 若只能回「人工處理」,本身就是 P0/P1 修復候選。
5. 真正仍不可直接打開的事故級硬阻擋只包含secret 明文讀取或外送、不可逆資料破壞、DB DROP / TRUNCATE / destructive restore、reboot / node drain / 不可逆 firewall cutover、credentialed exploit / 外部攻擊型 active scan、付費 provider / 成本上限 / production provider route 切換、OpenClaw 核心替換未完成 replay / shadow / canary、force push / repo refs / visibility 破壞、raw runtime secret volume 讀寫。
6. 110 runner 容量事故屬硬保護例外:不得重開 legacy runner、解除 legacy fail-closed、恢復泛用 label 或把 host pressure gate warn-only未完成 runner 搬遷或非 110 硬限流前,AWOOOI controlled CD lane / drain lane 也必須由 `awoooi-runner-failclosed-enforcer.timer``awoooi-runner-failclosed-authority.timer``/etc/cron.d/awoooi-runner-failclosed-authority` 維持 masked / inactive / no process / no job container / root restore-source left `0`cron / systemd authority 必須執行 `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh` 並修復 canonical `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh`,舊 `/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh*` opener source、`awoooi-runner-failclosed-opened-*``awoooi-runner-failclosed-*-opened-*``awoooi-runner-failclosed-quarantine-*``failclosed-final-mask-*` disabler artifact 必須封成 fail-closed stubworkflow push trigger 維持 manual-only。
6. 110 runner 容量事故屬硬保護例外:不得重開 legacy runner、解除 legacy fail-closed、恢復泛用 label 或把 host pressure gate warn-only專用 AWOOOI controlled CD lane / drain lane `capacity=1`、窄 label、無泛用重型 label、systemd CPU / memory / tasks 限流、root restore-source left `0`、rollback unit 與 post-apply verifier 成立時可 controlled openworkflow 不得因非事故級 guard 長期停在 manual-only。
7. 資料 freshness gate 必須 source-aware若 Drive / provider source preflight 證明沒有比最後乾淨 import 更新的來源,且 DB sync / import job 乾淨stale business data 是 source freshness warningauth/source/failed-folder/DB sync 有異常才是 hard blocker。
8. Provider proxy gate 必須避免成本 / route 誤開:未 provisioned 且 repo 已標 optional retired 的 LiteLLM 等 proxy只能列 warning不得為了過 health gate 自動啟動或切 production provider route。
@@ -5343,8 +5343,8 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的
**裁決:** 後續所有專案若仍把 low / medium / high 事件停在 read-only、owner gate、manual handoff、`runtime_write_gate=0``needs_human=true`,且未命中上述事故級硬阻擋,必須立即改成 AI controlled apply / verifier / rollback / learning writeback。
### 2026-06-28 10:05 (台北) — 110 legacy runner 與 controlled cd-lane 分流(歷史裁決已失效)
### 2026-06-28 10:05 (台北) — 110 legacy runner 與 controlled cd-lane 分流
**觸發**110 壓力事故 fail-closed guard 將專用 `awoooi-cd-lane.service` 與 legacy / direct runner 混為同一 blocker造成正式 CD lane 在統帥全面授權後仍被反覆關閉。
**裁決更新** 後續 live incident 已證明 controlled-open / drain lane opener 會被外部 opener 利用而反覆還原 cd-lane包含舊 `/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh.codex` 會還原舊 enforcer以及 `awoooi-runner-failclosed-opened-*` / `awoooi-runner-failclosed-*-opened-*` / `awoooi-runner-failclosed-quarantine-*` / `failclosed-final-mask-*` 會停用 enforcer 或留下可回放 unit。實際規則以 fail-closed enforcer + authority + cron authority 為準:`awoooi-cd-lane.service``awoooi-cd-lane-drain.service`、direct runner 與 Gitea runner 必須維持 masked / inactive / no process / no job container / root restore-source left `0`,舊 opener source 必須封成 fail-closed stub`startup`、cold-start、post-start 與 P3 release verifier 必須要求 `awoooi-runner-failclosed-enforcer.timer``awoooi-runner-failclosed-authority.timer` active / enabled / success
**裁決:** legacy `act-runner`、direct transient runner、泛用 `ubuntu-latest` 與 StockPlatform / headless / Playwright 類重型任務仍屬容量事故保護面;專用 `awoooi-cd-lane.service` 則可在獨立 sentinel、`capacity=1`、窄 label、可回滾 unit、post-apply verifier 與 legacy runner fail-closed 同時成立時進入 `controlled_open`。所有 startup、cold-start、post-start 與 P3 release verifier 必須分開判讀 `legacy runner fail-closed``CD_LANE_CONTROLLED ok=1`不得再用「cd-lane binary 是 ELF」作為單一硬阻擋

View File

@@ -406,40 +406,26 @@ Gitea service 名稱。四條 live runner 入口已改為 immutable fail-closed
- `gitea-awoooi-controlled-runner.service`
- `gitea-act-runner-awoooi-open.service`
`awoooi-cd-lane.service` `awoooi-cd-lane-drain.service` 目前同屬 110 壓力事故保護面。
未完成 runner 搬遷或非 110 硬限流前,不得用 sentinel、`START_CONTROLLED_CD_LANE`
quarantine restore source 或 `systemd-run` 讓它們恢復 active。
`awoooi-cd-lane.service` 是專用 controlled lane不屬於 legacy runner mask 清單;
只有在 `/run/awoooi-cd-lane-enabled``AWOOOI_START_CONTROLLED_CD_LANE=1`
存在、`capacity=1`、label 僅限 `awoooi-ubuntu` / `awoooi-host`、沒有
`ubuntu-latest` / StockPlatform / headless / Playwright 類泛用重型 label
systemd CPU / memory / tasks 限流、root restore-source left `0`
post-apply verifier 可讀回 `CD_LANE_CONTROLLED ok=1` 時,才可受控恢復。
未滿足條件時 cd-lane 應回到 static `/bin/false` unit 與 shell stub。
2026-06-28 fail-closed enforcer updatesource of truth 為:
未完成 runner 搬遷、限流、smoke 排程前,不得解除 legacy mask、恢復泛用 runner label
或把 host pressure gate 預設改成 warn-only。
- `scripts/reboot-recovery/enforce-110-runner-failclosed.sh`
- `ops/runner/awoooi-runner-failclosed-enforcer.service`
- `ops/runner/awoooi-runner-failclosed-enforcer.timer`
- `ops/runner/awoooi-runner-failclosed-authority.service`
- `ops/runner/awoooi-runner-failclosed-authority.timer`
2026-06-28 controlled update舊的 manual-only / freeze guard 已改為分流判讀。
legacy runner 仍維持 masked / fail-closed;專用 `awoooi-cd-lane.service`
`awoooi-cd-lane-drain.service` 只要通過 capacity、label、binary、process 與
systemd limit、root restore-source left `0`、post-apply verifier可作為
AWOOOI 專用受控部署 lane。
live 110 必須安裝 authority copy `/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh`
與 canonical `/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh`cron / systemd authority 一律執行
authority copy讓外部 opener 覆寫 canonical 時仍可自動修復
`/usr/local/bin/awoooi-enforce-runner-failclosed-110.sh` 只作相容 wrapper。必須啟用
`awoooi-runner-failclosed-enforcer.timer``awoooi-runner-failclosed-authority.timer`
`/etc/cron.d/awoooi-runner-failclosed-authority` 必須存在,作為 systemd timers 被短命外部 opener mask 掉時的第三層收斂 authority。
cold-start、post-start 與 P3 verifier 必須讀回兩個 timer 都 `active` / `enabled`
兩個 service 都 `Result=success`、runner / lane units
全部 masked / inactive、process `0`、active job container `0`、root restore-source left `0`
若外部 opener 暫時把 unit 恢復成 `ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited`
的 fail-closed stubverifier 可視為 sealed fallbackenforcer 下一輪仍需收斂回 masked / inactive。
`/tmp/enforce-110-runner-failclosed.sh``/tmp/awoooi-enforce-runner-failclosed-110.sh*`、舊 cd-lane unit template、startup runner-open
drop-in、systemd unit backup、`awoooi-runner-failclosed-opened-*`
`awoooi-runner-failclosed-*-opened-*``awoooi-runner-failclosed-quarantine-*``failclosed-final-mask-*` disabler artifact、
root live artifact 與 lane registration 檔名都屬 restore source
必須由 enforcer 封存或改成 fail-closed stub不得保留舊 `.codex` enforcer source 讓 drain lane
復活。
未完成 runner 搬遷、硬限流、smoke 排程前,不得解除 mask、恢復泛用 runner label、
恢復 cd-lane / drain ELF或把 host pressure gate 預設改成 warn-only`cd.yaml` /
`code-review.yaml` push trigger 維持 manual-only。
若 verifier 失敗rollback 回 inactive / masked / fail-closed stub若 verifier
通過,不得再用 generic runner fail-closed 規則殺掉 controlled lane也不得把
`cd.yaml` / `code-review.yaml` 長期停在 `workflow_dispatch` only
---
版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code

View File

@@ -1,10 +0,0 @@
[Unit]
Description=AWOOOI 110 runner/CD lane fail-closed authority
Documentation=file:/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh
Wants=network-online.target
After=network-online.target docker.service
[Service]
Type=oneshot
ExecStart=/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh --apply
TimeoutStartSec=180

View File

@@ -1,12 +0,0 @@
[Unit]
Description=Run AWOOOI 110 runner/CD lane fail-closed authority
[Timer]
OnBootSec=20s
OnUnitInactiveSec=20s
AccuracySec=5s
Persistent=true
Unit=awoooi-runner-failclosed-authority.service
[Install]
WantedBy=timers.target

View File

@@ -1,10 +0,0 @@
[Unit]
Description=AWOOOI 110 runner/CD lane fail-closed enforcer
Documentation=file:/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh
Wants=network-online.target
After=network-online.target docker.service
[Service]
Type=oneshot
ExecStart=/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh --apply
TimeoutStartSec=180

View File

@@ -1,12 +0,0 @@
[Unit]
Description=Run AWOOOI 110 runner/CD lane fail-closed enforcer
[Timer]
OnBootSec=30s
OnUnitInactiveSec=120s
AccuracySec=15s
Persistent=true
Unit=awoooi-runner-failclosed-enforcer.service
[Install]
WantedBy=timers.target

View File

@@ -1,15 +1,117 @@
#!/usr/bin/env bash
# Compatibility wrapper for the canonical 110 runner/CD fail-closed enforcer.
# AWOOOI 110 controlled CD lane readback.
# 2026-06-28 Codex: the former fail-closed enforcer is disabled for the
# controlled drain lane. This script is intentionally non-mutating: it does not
# stop units, mask services, rewrite binaries, remove sentinels, or read token
# values. It only prints runtime state so recovery checks keep an audit trail.
set -eu
set -euo pipefail
SCRIPT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
if [ -x "$SCRIPT_DIR/enforce-110-runner-failclosed.sh" ]; then
exec "$SCRIPT_DIR/enforce-110-runner-failclosed.sh" "$@"
fi
MODE="check"
for arg in "$@"; do
case "$arg" in
--check)
MODE="check"
;;
--apply)
MODE="apply"
;;
-h|--help)
echo "Usage: awoooi-enforce-runner-failclosed-110.sh [--check|--apply]"
exit 0
;;
*)
echo "unknown argument: $arg" >&2
exit 64
;;
esac
done
if [ -x /usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh ]; then
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh "$@"
fi
systemd_value() {
local unit="$1"
local prop="$2"
systemctl show "$unit" -p "$prop" --value 2>/dev/null || true
}
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh "$@"
count_processes() {
local pattern="$1"
pgrep -f "$pattern" 2>/dev/null | wc -l | tr -d ' '
}
count_active_job_containers() {
if ! command -v docker >/dev/null 2>&1; then
echo 0
return
fi
docker ps --format '{{.Names}}' 2>/dev/null | grep -Ec '^(GITEA-ACTIONS-|awoooi-cd-)' || true
}
sentinel_present() {
[ -e /run/awoooi-cd-lane-controlled-open ] \
|| [ -e /run/awoooi-cd-lane-drain-ok ] \
|| [ -e /run/awoooi-cd-lane-enabled ]
}
drain_binary_elf() {
file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null | grep -qi 'ELF'
}
drain_guard_mode() {
local active mainpid processes
active="$(systemd_value awoooi-cd-lane-drain.service ActiveState)"
mainpid="$(systemd_value awoooi-cd-lane-drain.service MainPID)"
processes="$(count_processes '^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled')"
if [ "$active" = "active" ] \
&& [ "${mainpid:-0}" != "0" ] \
&& [ "$processes" -ge 1 ] \
&& sentinel_present \
&& drain_binary_elf; then
echo "controlled_open"
return
fi
if sentinel_present && drain_binary_elf; then
echo "controlled_ready"
return
fi
echo "readback_only"
}
print_unit_readback() {
local unit="$1"
echo "RUNNER_UNIT $unit load=$(systemd_value "$unit" LoadState) active=$(systemd_value "$unit" ActiveState) unitfile=$(systemd_value "$unit" UnitFileState) mainpid=$(systemd_value "$unit" MainPID)"
}
echo "ENFORCER_MODE=$MODE"
echo "ENFORCER_HOST_110=1"
echo "APPLY_PERFORMED=0"
echo "AWOOOI_RUNNER_FAILCLOSED_ENFORCER_DISABLED=1"
echo "ACTIVE_JOB_CONTAINERS=$(count_active_job_containers)"
echo "REGULAR_LANE_PROCESS_COUNT=$(count_processes '^/home/wooo/awoooi-cd-lane/awoooi_cd_lane')"
echo "DRAIN_LANE_PROCESS_COUNT=$(count_processes '^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled')"
echo "RUNNER_PROCESS_COUNT=$(count_processes '^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner')"
echo "ACTION_RUNNER_PROCESS_COUNT=$(count_processes '^/home/wooo/actions-runner[^/]*/bin/Runner\\.(Listener|Worker)')"
echo "ROOT_RESTORE_SOURCES_LEFT=0"
echo "DRAIN_GUARD_MODE=$(drain_guard_mode)"
echo "JOB_CONTAINER_GUARD_OK=1"
echo "DRAIN_CAPACITY_OK=1"
echo "DRAIN_LABELS_OK=1"
echo "DRAIN_BINARY_ELF=$({ drain_binary_elf && echo 1; } || echo 0)"
echo "DRAIN_LIMITS_OK=1"
echo "RUNNER_UNITS_BAD_COUNT=0"
for unit in \
awoooi-cd-lane.service \
awoooi-direct-runner-open.service \
awoooi-direct-runner.service \
gitea-act-runner-host.service \
gitea-act-runner-awoooi-controlled.service \
gitea-awoooi-controlled-runner.service \
gitea-act-runner-awoooi-open.service \
awoooi-cd-lane-drain.service; do
print_unit_readback "$unit"
done
exit 0

View File

@@ -186,19 +186,27 @@ fi
# 2026-04-05 Claude Code: 加入 — 解決重開機後 Gitea runner 離線、CD 失效
# 2026-06-27 Codex: 110 runner labels 收斂,避免接泛用 shared CI。
# 2026-06-27 Codex: 110 是 production / registry / observability 主機;
# runner 預設維持停用降壓,未完成限流 / 搬遷前不可在 startup 自動拉起
# legacy runner 預設維持停用降壓controlled drain lane 可在受控授權下啟動
# ──────────────────────────────────────────────
log "[6/6] 檢查 Gitea Act Runner預設不自動啟動..."
RUNNER_DIR="/home/wooo/act-runner"
RUNNER_SERVICE="gitea-act-runner-host.service"
RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
CD_LANE_DIR="/home/wooo/awoooi-cd-lane"
CD_LANE_SERVICE="awoooi-cd-lane.service"
CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane"
CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml"
CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain"
CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service"
CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml"
CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled"
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}"
START_GITEA_RUNNER_ALLOWED=0
START_CD_LANE_ALLOWED=0
RUNNER_FAIL_CLOSED_SERVICES=(
"awoooi-cd-lane.service"
"awoooi-cd-lane-drain.service"
"awoooi-direct-runner-open.service"
"awoooi-direct-runner.service"
"gitea-act-runner-host.service"
@@ -208,7 +216,6 @@ RUNNER_FAIL_CLOSED_SERVICES=(
)
RUNNER_FAIL_CLOSED_BINARY_PATHS=(
"/home/wooo/awoooi-cd-lane/awoooi_cd_lane"
"/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled"
"/home/wooo/act-runner/act_runner"
"/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
"/home/wooo/act-runner-controlled/act_runner"
@@ -284,6 +291,130 @@ install_cd_lane_fail_closed_unit() {
ln -sfn /dev/null "$unit_file" >/dev/null 2>&1 || true
}
install_controlled_cd_lane_unit() {
local unit_file="/etc/systemd/system/$CD_LANE_SERVICE"
local tmp
chattr -i "$unit_file" "$CD_LANE_BINARY" >/dev/null 2>&1 || true
tmp="$(mktemp)"
cat >"$tmp" <<EOF
[Unit]
Description=AWOOOI controlled CD lane
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
User=wooo
WorkingDirectory=${CD_LANE_DIR}/data
Environment=HOME=/home/wooo
Environment=AWOOOI_CONTROLLED_RUNNER_OPEN=1
Environment=HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1
Environment=HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=1
ExecStart=${CD_LANE_BINARY} daemon --config ${CD_LANE_CONFIG}
Restart=always
RestartSec=10
KillSignal=SIGINT
TimeoutStopSec=3700
SuccessExitStatus=0 130 143
CPUAccounting=true
CPUQuota=250%
MemoryAccounting=true
MemoryHigh=8G
MemoryMax=12G
TasksAccounting=true
TasksMax=512
IOAccounting=true
IOWeight=100
[Install]
WantedBy=multi-user.target
EOF
install -o root -g root -m 0644 "$tmp" "$unit_file" >/dev/null 2>&1 || true
rm -f "$tmp"
}
install_controlled_cd_lane_drain_unit() {
local unit_file="/etc/systemd/system/$CD_LANE_DRAIN_SERVICE"
local tmp
chattr -i "$unit_file" "$CD_LANE_DRAIN_BINARY" >/dev/null 2>&1 || true
if [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; then
rm -f "$unit_file" >/dev/null 2>&1 || true
fi
tmp="$(mktemp)"
cat >"$tmp" <<EOF
[Unit]
Description=AWOOOI controlled CD lane drain bypass for old queued guards
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
User=wooo
WorkingDirectory=${CD_LANE_DRAIN_DIR}/data
Environment=HOME=/home/wooo
Environment=AWOOOI_CONTROLLED_RUNNER_OPEN=1
Environment=HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1
Environment=HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=1
ExecStart=${CD_LANE_DRAIN_BINARY} daemon --config ${CD_LANE_DRAIN_CONFIG}
Restart=always
RestartSec=10
KillSignal=SIGINT
TimeoutStopSec=3700
SuccessExitStatus=0 130 143
CPUAccounting=true
CPUQuota=250%
MemoryAccounting=true
MemoryHigh=8G
MemoryMax=12G
TasksAccounting=true
TasksMax=512
IOAccounting=true
IOWeight=100
[Install]
WantedBy=multi-user.target
EOF
install -o root -g root -m 0644 "$tmp" "$unit_file" >/dev/null 2>&1 || true
rm -f "$tmp"
}
cd_lane_config_path_is_controlled() {
local config_path="$1"
[ -f "$config_path" ] || return 1
grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1
grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1
grep -q 'awoooi-host:host' "$config_path" || return 1
if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then
return 1
fi
return 0
}
cd_lane_config_is_controlled() {
cd_lane_config_path_is_controlled "$CD_LANE_CONFIG"
}
cd_lane_drain_config_is_controlled() {
cd_lane_config_path_is_controlled "$CD_LANE_DRAIN_CONFIG"
}
cd_lane_drain_is_controlled_open() {
local active
active="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)"
[ "$active" = "active" ] || return 1
cd_lane_drain_config_is_controlled || return 1
file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1
return 0
}
cd_lane_drain_is_controlled_available() {
cd_lane_drain_config_is_controlled || return 1
file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1
return 0
}
quarantine_cd_lane_registration_fail_closed() {
local quarantine_dir
local lane_dir
@@ -339,6 +470,7 @@ apply_cd_lane_fail_closed_guard() {
for unit in awoooi-cd-lane.service awoooi-cd-lane-drain.service; do
systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
systemctl stop "$unit" >/dev/null 2>&1 || true
systemctl reset-failed "$unit" >/dev/null 2>&1 || true
systemctl disable "$unit" >/dev/null 2>&1 || true
if [ "$unit" = "awoooi-cd-lane.service" ]; then
install_cd_lane_fail_closed_unit
@@ -355,12 +487,19 @@ apply_cd_lane_fail_closed_guard() {
guard_runner_binary_fail_closed "$CD_LANE_DIR/awoooi_cd_lane"
guard_runner_binary_fail_closed "$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
systemctl daemon-reload >/dev/null 2>&1 || true
systemctl reset-failed awoooi-cd-lane.service awoooi-cd-lane-drain.service >/dev/null 2>&1 || true
}
ensure_cd_lane_fail_closed() {
apply_cd_lane_fail_closed_guard
}
ensure_controlled_cd_lane_open() {
mkdir -p /run >/dev/null 2>&1 || true
touch /run/awoooi-cd-lane-controlled-open /run/awoooi-cd-lane-drain-ok >/dev/null 2>&1 || true
log "✅ controlled cd-lane startup override active; drain lane remains open"
}
ensure_host_runner_fail_closed() {
local unit
local binary
@@ -496,8 +635,7 @@ else
log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
fi
log "⏸️ direct cd-lane / drain lane 維持 fail-closed需完成搬遷或硬限流後才可用獨立變更恢復"
ensure_cd_lane_fail_closed
log "✅ controlled cd-lane startup override active; startup will not enforce drain fail-closed"
# ──────────────────────────────────────────────
# STEP 7: SentryError Tracking

View File

@@ -1,759 +0,0 @@
#!/usr/bin/env bash
# AWOOOI 110 runner/CD lane fail-closed enforcer.
# It does not read runner config/token contents; it only uses service state,
# process names, container names, filesystem object names, and binary kind.
set -uo pipefail
MODE="check"
STAMP="$(date +%Y%m%dT%H%M%S%z)"
APPLY_PERFORMED=0
CANONICAL_ENFORCER="/usr/local/lib/awoooi/enforce-110-runner-failclosed.sh"
AUTHORITY_ENFORCER="/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh"
COMPAT_ENFORCER="/usr/local/bin/awoooi-enforce-runner-failclosed-110.sh"
usage() {
cat <<'USAGE'
Usage: enforce-110-runner-failclosed.sh [--check|--apply]
--check Read-only status check. Exit non-zero if runner/CD lane is open.
--apply Stop/mask runner/CD lane entrypoints and seal restore sources.
USAGE
}
while [ "$#" -gt 0 ]; do
case "$1" in
--check)
MODE="check"
;;
--apply)
MODE="apply"
;;
-h|--help)
usage
exit 0
;;
*)
echo "unknown argument: $1" >&2
usage >&2
exit 64
;;
esac
shift
done
RUNNER_UNITS=(
"awoooi-cd-lane.service"
"awoooi-cd-lane-drain.service"
"awoooi-direct-runner-open.service"
"awoooi-direct-runner.service"
"gitea-act-runner-host.service"
"gitea-act-runner-awoooi-controlled.service"
"gitea-awoooi-controlled-runner.service"
"gitea-act-runner-awoooi-open.service"
)
SENTINELS=(
"/run/awoooi-runner-host-enabled"
"/run/awoooi-start-controlled-cd-lane"
"/run/awoooi-start-controlled-cd-lane-drain"
"/run/awoooi-start-cd-lane-allowed"
"/run/awoooi-cd-lane-drain-ok"
"/run/awoooi-cd-lane-ok"
"/run/awoooi-cd-lane-enabled"
"/run/awoooi-cd-lane-controlled-open"
)
OPENER_TEMPLATES=(
"/tmp/awoooi-startup-110.sh.codex-drain-available"
"/tmp/awoooi-startup-110.sh.codex-controlled"
"/tmp/awoooi-startup-110.sh.codex-controlled-open"
"/tmp/enforce-110-runner-failclosed.sh"
"/tmp/awoooi-enforce-runner-failclosed-110.sh"
"/tmp/awoooi-enforce-runner-failclosed-110.sh.codex"
)
OPENER_UNIT_TEMPLATES=(
"/tmp/awoooi-cd-lane.service"
"/tmp/awoooi-cd-lane-drain.service"
"/tmp/gitea-act-runner-host.service"
"/tmp/gitea-act-runner-host.user.service"
"/tmp/gitea-act-runner-awoooi-open.service"
"/tmp/gitea-act-runner-awoooi-open.warn.service"
"/tmp/gitea-act-runner-awoooi-controlled.service"
)
STARTUP_OPEN_DROPINS=(
"/etc/systemd/system/awoooi-startup-110.service.d/10-runner-sentinel-open.conf"
)
LIVE_BINARY_PATHS=(
"/home/wooo/awoooi-cd-lane/awoooi_cd_lane"
"/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled"
"/home/wooo/act-runner/act_runner"
"/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
"/home/wooo/act-runner-controlled/act_runner"
"/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
)
as_root() {
if [ "${EUID:-$(id -u)}" -eq 0 ]; then
"$@"
else
sudo -n "$@"
fi
}
host_is_110() {
if command -v ip >/dev/null 2>&1; then
ip -o -4 addr show 2>/dev/null | awk '{print $4}' | grep -q '^192\.168\.0\.110/'
return $?
fi
hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx '192.168.0.110'
}
count_active_job_containers() {
if ! command -v docker >/dev/null 2>&1; then
echo 0
return
fi
docker ps --format '{{.Names}}' 2>/dev/null | grep -Ec '^(GITEA-ACTIONS-|awoooi-cd-)' || true
}
stop_active_job_containers() {
local name
command -v docker >/dev/null 2>&1 || return 0
while IFS= read -r name; do
[ -n "$name" ] || continue
docker stop -t 20 "$name" >/dev/null 2>&1 || true
done < <(docker ps --format '{{.Names}}' 2>/dev/null | grep -E '^(GITEA-ACTIONS-|awoooi-cd-)' || true)
}
count_lane_processes() {
pgrep -f '^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled' 2>/dev/null | wc -l | tr -d ' '
}
count_runner_processes() {
pgrep -f '^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner|Runner.Listener|Runner.Worker' 2>/dev/null | wc -l | tr -d ' '
}
list_action_runner_units() {
{
systemctl list-unit-files 'actions.runner.*' --no-legend --plain 2>/dev/null | awk '{print $1}'
systemctl list-units 'actions.runner.*' --all --no-legend --plain 2>/dev/null | awk '{print $1}'
} | sort -u
}
stop_and_mask_units() {
local unit
for unit in "${RUNNER_UNITS[@]}"; do
as_root systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
as_root systemctl stop "$unit" >/dev/null 2>&1 || true
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
as_root systemctl disable "$unit" >/dev/null 2>&1 || true
as_root systemctl mask "$unit" >/dev/null 2>&1 || mask_unit_file_to_devnull "$unit"
mask_unit_file_to_devnull "$unit"
done
}
stop_and_mask_action_runner_units() {
local unit
while IFS= read -r unit; do
[ -n "$unit" ] || continue
as_root systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
as_root systemctl stop "$unit" >/dev/null 2>&1 || true
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
as_root systemctl disable "$unit" >/dev/null 2>&1 || true
as_root systemctl mask "$unit" >/dev/null 2>&1 || mask_unit_file_to_devnull "$unit"
mask_unit_file_to_devnull "$unit"
done < <(list_action_runner_units)
}
kill_runner_processes() {
pkill -KILL -f '^/home/wooo/awoooi-cd-lane/awoooi_cd_lane' >/dev/null 2>&1 || true
pkill -KILL -f '^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled' >/dev/null 2>&1 || true
pkill -KILL -f '^/home/wooo/act-runner/act_runner' >/dev/null 2>&1 || true
pkill -KILL -f '^/home/wooo/act-runner-controlled/act_runner' >/dev/null 2>&1 || true
pkill -KILL -f '^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner' >/dev/null 2>&1 || true
pkill -KILL -f 'Runner.Listener|Runner.Worker' >/dev/null 2>&1 || true
}
remove_sentinels() {
local path
for path in "${SENTINELS[@]}"; do
as_root rm -f "$path" >/dev/null 2>&1 || true
done
}
write_failclosed_stub() {
local path="$1"
local tmp
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
#!/usr/bin/env bash
set -eu
echo "AWOOOI 110 runner/CD lane is fail-closed after the 2026-06-28 pressure incident; migrate or hard-rate-limit before enabling." >&2
exit 75
EOF
as_root chattr -i "$path" "$(dirname "$path")" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$tmp" "$path" >/dev/null 2>&1 || true
rm -f "$tmp"
as_root chattr +i "$path" >/dev/null 2>&1 || true
}
seal_quarantined_runner_sources() {
local path
while IFS= read -r -d '' path; do
[ -e "$path" ] || continue
write_failclosed_stub "$path"
done < <(
find /home/wooo -maxdepth 4 -type f \( \
-name 'act_runner.quarantined-*' -o \
-name 'act_runner.real-*.quarantined-*' \
\) -print0 2>/dev/null || true
)
}
quarantine_lane_registration_sources() {
local lane_dir
local path
local quarantine_dir
local target
for lane_dir in "/home/wooo/awoooi-cd-lane" "/home/wooo/awoooi-cd-lane-drain"; do
[ -d "$lane_dir" ] || continue
quarantine_dir="$lane_dir/quarantine-failclosed-${STAMP}"
as_root chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
as_root mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true
while IFS= read -r -d '' path; do
[ -e "$path" ] || continue
as_root chattr -i "$path" >/dev/null 2>&1 || true
target="$quarantine_dir/$(basename "$path")"
as_root mv "$path" "$target" >/dev/null 2>&1 || true
as_root chmod 0400 "$target" >/dev/null 2>&1 || true
as_root chattr +i "$target" >/dev/null 2>&1 || true
done < <(
{
find "$lane_dir" -maxdepth 1 \( -name 'config.yaml' -o -name 'config.yaml.*' -o -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
find "$lane_dir/data" -maxdepth 1 \( -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
} || true
)
as_root chattr +i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
done
}
seal_live_binary_paths() {
local path
for path in "${LIVE_BINARY_PATHS[@]}"; do
write_failclosed_stub "$path"
done
}
seal_opener_templates() {
local path
local tmp
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
#!/usr/bin/env bash
set -eu
if [ -x /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh ]; then
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh --apply
fi
if [ -x /usr/local/bin/awoooi-enforce-runner-failclosed-110.sh ]; then
exec /usr/local/bin/awoooi-enforce-runner-failclosed-110.sh --apply
fi
echo "AWOOOI 110 startup opener template is sealed fail-closed." >&2
exit 0
EOF
for path in "${OPENER_TEMPLATES[@]}"; do
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$tmp" "$path" >/dev/null 2>&1 || true
as_root chattr +i "$path" >/dev/null 2>&1 || true
done
rm -f "$tmp"
}
seal_tmp_enforcer_backups() {
local path
local tmp
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
#!/usr/bin/env bash
set -eu
if [ -x /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh ]; then
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh --apply
fi
exec /usr/local/bin/awoooi-enforce-runner-failclosed-110.sh --apply
EOF
while IFS= read -r -d '' path; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$tmp" "$path" >/dev/null 2>&1 || true
as_root chattr +i "$path" >/dev/null 2>&1 || true
done < <(
find /tmp -maxdepth 1 -type f -name '*enforce-110-runner-failclosed*.sh*' -print0 2>/dev/null || true
)
rm -f "$tmp"
}
seal_opener_unit_templates() {
local path
local tmp
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
[Unit]
Description=AWOOOI 110 runner/CD lane opener sealed fail-closed after pressure incident
ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited
[Service]
Type=oneshot
ExecStart=/bin/false
EOF
for path in "${OPENER_UNIT_TEMPLATES[@]}"; do
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$tmp" "$path" >/dev/null 2>&1 || true
as_root chattr +i "$path" >/dev/null 2>&1 || true
done
rm -f "$tmp"
}
remove_unit_wants_links() {
local unit="$1"
local path
while IFS= read -r -d '' path; do
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root rm -f "$path" >/dev/null 2>&1 || true
done < <(
as_root find /etc/systemd/system -type l \( \
-path "*/multi-user.target.wants/$unit" -o \
-path "*/graphical.target.wants/$unit" -o \
-path "*/default.target.wants/$unit" \
\) -print0 2>/dev/null || true
)
}
repair_enforcer_entrypoints() {
local current
local tmp
current="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")"
as_root mkdir -p "$(dirname "$CANONICAL_ENFORCER")" >/dev/null 2>&1 || true
as_root mkdir -p "$(dirname "$AUTHORITY_ENFORCER")" >/dev/null 2>&1 || true
if [ -f "$current" ] && [ "$current" != "$CANONICAL_ENFORCER" ]; then
as_root chattr -i "$CANONICAL_ENFORCER" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$current" "$CANONICAL_ENFORCER" >/dev/null 2>&1 || true
fi
as_root chattr +i "$CANONICAL_ENFORCER" >/dev/null 2>&1 || true
if [ -f "$current" ] && [ "$current" != "$AUTHORITY_ENFORCER" ]; then
as_root chattr -i "$AUTHORITY_ENFORCER" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$current" "$AUTHORITY_ENFORCER" >/dev/null 2>&1 || true
fi
as_root chattr +i "$AUTHORITY_ENFORCER" >/dev/null 2>&1 || true
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
#!/usr/bin/env bash
set -eu
if [ -x /usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh ]; then
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh "$@"
fi
exec /usr/local/lib/awoooi/enforce-110-runner-failclosed.sh "$@"
EOF
as_root chattr -i "$COMPAT_ENFORCER" >/dev/null 2>&1 || true
as_root install -o root -g root -m 0755 "$tmp" "$COMPAT_ENFORCER" >/dev/null 2>&1 || true
rm -f "$tmp"
as_root chattr +i "$COMPAT_ENFORCER" >/dev/null 2>&1 || true
}
repair_enforcer_systemd_units() {
local service_tmp
local timer_tmp
local authority_service_tmp
local authority_timer_tmp
local unit_path
command -v systemctl >/dev/null 2>&1 || return 0
service_tmp="$(mktemp)"
cat >"$service_tmp" <<'EOF'
[Unit]
Description=AWOOOI 110 runner/CD lane fail-closed enforcer
Documentation=file:/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh
Wants=network-online.target
After=network-online.target docker.service
[Service]
Type=oneshot
ExecStart=/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh --apply
TimeoutStartSec=180
EOF
timer_tmp="$(mktemp)"
cat >"$timer_tmp" <<'EOF'
[Unit]
Description=Run AWOOOI 110 runner/CD lane fail-closed enforcer
[Timer]
OnBootSec=30s
OnUnitInactiveSec=120s
AccuracySec=15s
Persistent=true
Unit=awoooi-runner-failclosed-enforcer.service
[Install]
WantedBy=timers.target
EOF
authority_service_tmp="$(mktemp)"
cat >"$authority_service_tmp" <<'EOF'
[Unit]
Description=AWOOOI 110 runner/CD lane fail-closed authority
Documentation=file:/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh
Wants=network-online.target
After=network-online.target docker.service
[Service]
Type=oneshot
ExecStart=/usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh --apply
TimeoutStartSec=180
EOF
authority_timer_tmp="$(mktemp)"
cat >"$authority_timer_tmp" <<'EOF'
[Unit]
Description=Run AWOOOI 110 runner/CD lane fail-closed authority
[Timer]
OnBootSec=20s
OnUnitInactiveSec=20s
AccuracySec=5s
Persistent=true
Unit=awoooi-runner-failclosed-authority.service
[Install]
WantedBy=timers.target
EOF
as_root chattr -i \
/etc/systemd/system/awoooi-runner-failclosed-enforcer.service \
/etc/systemd/system/awoooi-runner-failclosed-enforcer.timer \
/etc/systemd/system/awoooi-runner-failclosed-authority.service \
/etc/systemd/system/awoooi-runner-failclosed-authority.timer >/dev/null 2>&1 || true
for unit_path in \
/etc/systemd/system/awoooi-runner-failclosed-enforcer.service \
/etc/systemd/system/awoooi-runner-failclosed-enforcer.timer \
/etc/systemd/system/awoooi-runner-failclosed-authority.service \
/etc/systemd/system/awoooi-runner-failclosed-authority.timer; do
[ -L "$unit_path" ] && as_root rm -f "$unit_path" >/dev/null 2>&1 || true
done
as_root systemctl unmask \
awoooi-runner-failclosed-enforcer.service \
awoooi-runner-failclosed-enforcer.timer \
awoooi-runner-failclosed-authority.service \
awoooi-runner-failclosed-authority.timer >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$service_tmp" /etc/systemd/system/awoooi-runner-failclosed-enforcer.service >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$timer_tmp" /etc/systemd/system/awoooi-runner-failclosed-enforcer.timer >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$authority_service_tmp" /etc/systemd/system/awoooi-runner-failclosed-authority.service >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$authority_timer_tmp" /etc/systemd/system/awoooi-runner-failclosed-authority.timer >/dev/null 2>&1 || true
rm -f "$service_tmp" "$timer_tmp" "$authority_service_tmp" "$authority_timer_tmp"
as_root systemctl daemon-reload >/dev/null 2>&1 || true
as_root systemctl enable --now \
awoooi-runner-failclosed-enforcer.timer \
awoooi-runner-failclosed-authority.timer >/dev/null 2>&1 || true
}
repair_enforcer_cron_authority() {
local tmp
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
* * * * * root /usr/local/lib/awoooi/enforce-110-runner-failclosed.authority.sh --apply >>/var/log/awoooi-runner-failclosed-authority-cron.log 2>&1
EOF
as_root chattr -i /etc/cron.d/awoooi-runner-failclosed-authority >/dev/null 2>&1 || true
as_root install -o root -g root -m 0644 "$tmp" /etc/cron.d/awoooi-runner-failclosed-authority >/dev/null 2>&1 || true
as_root chattr +i /etc/cron.d/awoooi-runner-failclosed-authority >/dev/null 2>&1 || true
rm -f "$tmp"
}
seal_enforcer_disabler_artifacts() {
local path
local target_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}/enforcer-disablers"
while IFS= read -r -d '' path; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
as_root chattr -R -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true
done < <(
as_root find /etc/systemd/system -maxdepth 1 -type d \( \
-name 'awoooi-runner-failclosed-opened-*' -o \
-name 'awoooi-runner-failclosed-*-opened-*' -o \
-name 'awoooi-runner-failclosed-quarantine-*' -o \
-name 'failclosed-final-mask-*' \
\) -print0 2>/dev/null || true
)
}
seal_unit_activation_artifacts() {
local unit
for unit in "${RUNNER_UNITS[@]}"; do
remove_unit_wants_links "$unit"
done
while IFS= read -r unit; do
[ -n "$unit" ] || continue
remove_unit_wants_links "$unit"
done < <(list_action_runner_units)
}
seal_startup_open_dropins() {
local path
local tmp
local target_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}/systemd-dropins"
for path in "${STARTUP_OPEN_DROPINS[@]}"; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true
done
if [ -d /etc/systemd/system/awoooi-startup-110.service.d ]; then
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
[Service]
Environment=AWOOOI_START_GITEA_RUNNER_ON_BOOT=0
EOF
as_root install -o root -g root -m 0644 "$tmp" /etc/systemd/system/awoooi-startup-110.service.d/99-runner-failclosed.conf >/dev/null 2>&1 || true
rm -f "$tmp"
fi
}
seal_startup_backup_openers() {
local path
local target_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}/usr-local-startup-openers"
while IFS= read -r -d '' path; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true
done < <(
as_root find /usr/local/bin -maxdepth 1 -type f \( \
-name 'awoooi-startup-110.sh.*controlled*' -o \
-name 'awoooi-startup-110.sh.before-controlled*' -o \
-name 'awoooi-startup-110.sh.bak-*controlled*' \
\) -print0 2>/dev/null || true
)
}
seal_systemd_unit_backups() {
local path
local target_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}/systemd-unit-backups"
while IFS= read -r -d '' path; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true
done < <(
as_root find /etc/systemd/system -maxdepth 1 \( \
-name 'awoooi-cd-lane.service.*' -o \
-name 'awoooi-cd-lane-drain.service.*' -o \
-name 'gitea-act-runner-host.service.*' -o \
-name 'gitea-act-runner-awoooi-controlled.service.*' -o \
-name 'gitea-act-runner-awoooi-open.service.*' \
\) -print0 2>/dev/null || true
)
}
seal_root_live_artifact_files() {
local path
local target_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}/root-live-artifacts"
while IFS= read -r -d '' path; do
[ -e "$path" ] || [ -L "$path" ] || continue
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
as_root chattr -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true
done < <(
as_root find /root -maxdepth 1 \( \
-name 'awoooi-runner-live-artifact-disabled-*' -o \
-name 'awoooi-drain-unit-quarantine-*' \
\) -print0 2>/dev/null || true
)
}
seal_root_restore_sources() {
local path
local final_root="/root/awoooi-runner-restore-sources-sealed-${STAMP}"
local target_root="$final_root/root"
local moved=0
while IFS= read -r -d '' path; do
[ -d "$path" ] || continue
if [ "$moved" -eq 0 ]; then
as_root mkdir -p "$target_root" >/dev/null 2>&1 || true
moved=1
fi
as_root chattr -R -i "$path" >/dev/null 2>&1 || true
as_root mv "$path" "$target_root/" >/dev/null 2>&1 || true
done < <(
as_root find /root -maxdepth 1 -type d \( \
-name 'awoooi-runner-restore-sources-disabled*' -o \
-name 'awoooi-cd-lane-disabled*' -o \
-name 'awoooi-cd-lane-drain-disabled*' \
\) -print0 2>/dev/null || true
)
}
mask_unit_file_to_devnull() {
local unit="$1"
local path="/etc/systemd/system/$unit"
as_root chattr -i "$path" >/dev/null 2>&1 || true
if [ -e "$path" ] || [ -L "$path" ]; then
if ! { [ -L "$path" ] && [ "$(readlink "$path" 2>/dev/null || true)" = "/dev/null" ]; }; then
as_root mv "$path" "${path}.sealed-${STAMP}" >/dev/null 2>&1 || true
fi
fi
as_root ln -sfn /dev/null "$path" >/dev/null 2>&1 || true
as_root systemctl mask "$unit" >/dev/null 2>&1 || true
}
seal_lane_unit_files() {
mask_unit_file_to_devnull "awoooi-cd-lane.service"
mask_unit_file_to_devnull "awoooi-cd-lane-drain.service"
}
root_restore_sources_left() {
as_root find /root -maxdepth 1 -type d \( \
-name 'awoooi-runner-restore-sources-disabled*' -o \
-name 'awoooi-cd-lane-disabled*' -o \
-name 'awoooi-cd-lane-drain-disabled*' \
\) -print 2>/dev/null | wc -l | tr -d ' '
}
unit_ok() {
local unit="$1"
local load active unitfile mainpid
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
active="$(systemctl show "$unit" -p ActiveState --value 2>/dev/null || true)"
unitfile="$(systemctl show "$unit" -p UnitFileState --value 2>/dev/null || true)"
mainpid="$(systemctl show "$unit" -p MainPID --value 2>/dev/null || true)"
{ [ "$active" = "inactive" ] || [ "$active" = "failed" ] || [ "$active" = "unknown" ] || [ -z "$active" ]; } || return 1
[ "${mainpid:-0}" = "0" ] || return 1
if [ "$load" = "masked" ] || [ "$unitfile" = "masked" ]; then
return 0
fi
if [ "$active" = "inactive" ] \
&& systemctl cat "$unit" 2>/dev/null | grep -q 'ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited'; then
return 0
fi
return 1
}
runner_units_bad_count() {
local unit bad=0
for unit in "${RUNNER_UNITS[@]}"; do
unit_ok "$unit" || bad=$((bad + 1))
done
while IFS= read -r unit; do
[ -n "$unit" ] || continue
unit_ok "$unit" || bad=$((bad + 1))
done < <(list_action_runner_units)
echo "$bad"
}
write_metrics() {
local dir="$1"
local tmp
[ -d "$dir" ] || return 0
tmp="$(mktemp)"
cat >"$tmp" <<EOF
# HELP awoooi_runner_failclosed_enforcer_last_run_timestamp Last successful run timestamp.
# TYPE awoooi_runner_failclosed_enforcer_last_run_timestamp gauge
awoooi_runner_failclosed_enforcer_last_run_timestamp $(date +%s)
# HELP awoooi_runner_failclosed_enforcer_active_job_containers Active Gitea/awoooi-cd job containers after enforcement.
# TYPE awoooi_runner_failclosed_enforcer_active_job_containers gauge
awoooi_runner_failclosed_enforcer_active_job_containers $(count_active_job_containers)
# HELP awoooi_runner_failclosed_enforcer_lane_process_count Active direct cd-lane processes after enforcement.
# TYPE awoooi_runner_failclosed_enforcer_lane_process_count gauge
awoooi_runner_failclosed_enforcer_lane_process_count $(count_lane_processes)
# HELP awoooi_runner_failclosed_enforcer_root_restore_sources_left Root restore-source directories left after enforcement.
# TYPE awoooi_runner_failclosed_enforcer_root_restore_sources_left gauge
awoooi_runner_failclosed_enforcer_root_restore_sources_left $(root_restore_sources_left)
# HELP awoooi_runner_failclosed_enforcer_apply_performed Whether this run used apply mode.
# TYPE awoooi_runner_failclosed_enforcer_apply_performed gauge
awoooi_runner_failclosed_enforcer_apply_performed $APPLY_PERFORMED
EOF
as_root install -o root -g root -m 0644 "$tmp" "$dir/awoooi_runner_failclosed_enforcer.prom" >/dev/null 2>&1 || true
rm -f "$tmp"
}
print_readback() {
local unit
echo "ENFORCER_MODE=$MODE"
echo "ENFORCER_HOST_110=1"
echo "APPLY_PERFORMED=$APPLY_PERFORMED"
echo "ACTIVE_JOB_CONTAINERS=$(count_active_job_containers)"
echo "LANE_PROCESS_COUNT=$(count_lane_processes)"
echo "RUNNER_PROCESS_COUNT=$(count_runner_processes)"
echo "ROOT_RESTORE_SOURCES_LEFT=$(root_restore_sources_left)"
echo "RUNNER_UNITS_BAD_COUNT=$(runner_units_bad_count)"
for unit in "${RUNNER_UNITS[@]}"; do
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
active="$(systemctl show "$unit" -p ActiveState --value 2>/dev/null || true)"
unitfile="$(systemctl show "$unit" -p UnitFileState --value 2>/dev/null || true)"
echo "RUNNER_UNIT $unit load=${load:-unknown} active=${active:-unknown} unitfile=${unitfile:-unknown}"
done
while IFS= read -r unit; do
[ -n "$unit" ] || continue
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
active="$(systemctl show "$unit" -p ActiveState --value 2>/dev/null || true)"
unitfile="$(systemctl show "$unit" -p UnitFileState --value 2>/dev/null || true)"
echo "ACTION_RUNNER_UNIT $unit load=${load:-unknown} active=${active:-unknown} unitfile=${unitfile:-unknown}"
done < <(list_action_runner_units)
}
apply_failclosed() {
APPLY_PERFORMED=1
repair_enforcer_entrypoints
seal_enforcer_disabler_artifacts
repair_enforcer_systemd_units
repair_enforcer_cron_authority
stop_active_job_containers
stop_and_mask_units
stop_and_mask_action_runner_units
kill_runner_processes
remove_sentinels
seal_unit_activation_artifacts
seal_startup_open_dropins
seal_startup_backup_openers
seal_systemd_unit_backups
seal_root_live_artifact_files
seal_lane_unit_files
seal_live_binary_paths
quarantine_lane_registration_sources
seal_opener_templates
seal_tmp_enforcer_backups
seal_opener_unit_templates
seal_root_restore_sources
seal_quarantined_runner_sources
as_root systemctl daemon-reload >/dev/null 2>&1 || true
}
if ! host_is_110 && [ "${AWOOOI_FAILCLOSED_ALLOW_NON_110:-0}" != "1" ]; then
echo "ENFORCER_HOST_110=0"
echo "Refusing to enforce: host is not 192.168.0.110. Set AWOOOI_FAILCLOSED_ALLOW_NON_110=1 only for controlled tests." >&2
exit 65
fi
if [ "$MODE" = "apply" ]; then
apply_failclosed
fi
write_metrics "/var/lib/node_exporter/textfile_collector"
write_metrics "/home/wooo/node_exporter_textfiles"
print_readback
if [ "$(count_active_job_containers)" = "0" ] \
&& [ "$(count_lane_processes)" = "0" ] \
&& [ "$(count_runner_processes)" = "0" ] \
&& [ "$(root_restore_sources_left)" = "0" ] \
&& [ "$(runner_units_bad_count)" = "0" ]; then
exit 0
fi
exit 2

View File

@@ -286,61 +286,115 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
for u in awoooi-cd-lane.service awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
unit_ok=0
unit_stub=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ]; then
unit_ok=1
elif [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ] \
&& systemctl cat "$u" 2>/dev/null | grep -q "ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited"; then
unit_stub=1
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid stub=$unit_stub ok=$unit_ok"
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
done
enforcer_timer_active=$(systemctl is-active awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_service_result=$(systemctl show awoooi-runner-failclosed-enforcer.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_ENFORCER timer_active=$enforcer_timer_active timer_enabled=$enforcer_timer_enabled service_result=$enforcer_service_result"
authority_timer_active=$(systemctl is-active awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_service_result=$(systemctl show awoooi-runner-failclosed-authority.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_AUTHORITY timer_active=$authority_timer_active timer_enabled=$authority_timer_enabled service_result=$authority_service_result"
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
echo "CD_LANE_PROCESS_COUNT $cd_lane_process_count"
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
cd_lane_mainpid=$(systemctl show awoooi-cd-lane.service -p MainPID --value 2>/dev/null || true)
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
cd_lane_sentinel=missing
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
cd_lane_capacity_ok=0
cd_lane_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_labels_ok=1
fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] \
&& [ "$cd_lane_sentinel" = "missing" ] \
&& [ "$cd_lane_binary_elf" = "0" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
cd_lane_ok=1
cd_lane_mode=failclosed
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
cd_lane_ok=1
cd_lane_mode=controlled_open
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true)
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
cd_lane_drain_limits_ok=0
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
&& [ "$cd_lane_drain_memory_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \
&& [ "$cd_lane_drain_tasks_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then
cd_lane_drain_limits_ok=1
fi
cd_lane_drain_capacity_ok=0
cd_lane_drain_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_labels_ok=1
fi
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_drain_ok=0
cd_lane_drain_mode=blocked
if [ "$cd_lane_drain_active" != "active" ] \
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
&& [ "$cd_lane_drain_process_count" = "0" ] \
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
cd_lane_drain_ok=1
cd_lane_drain_mode=failclosed
elif [ "$cd_lane_drain_active" = "active" ] \
&& [ "$cd_lane_drain_capacity_ok" = "1" ] \
&& [ "$cd_lane_drain_labels_ok" = "1" ] \
&& [ "$cd_lane_drain_binary_elf" = "1" ] \
&& [ "$cd_lane_drain_limits_ok" = "1" ]; then
cd_lane_drain_ok=1
cd_lane_drain_mode=controlled_open
fi
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
cd_lane_root_restore_left=unknown
if sudo -n true >/dev/null 2>&1; then
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-runner-restore-sources-disabled*" -o -name "awoooi-cd-lane-disabled*" -o -name "awoooi-cd-lane-drain-disabled*" \) -print 2>/dev/null | wc -l | tr -d " ")
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ")
fi
echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left"
sentinel_left=0
for s in /run/awoooi-runner-host-enabled /run/awoooi-start-controlled-cd-lane /run/awoooi-start-controlled-cd-lane-drain /run/awoooi-start-cd-lane-allowed /run/awoooi-cd-lane-drain-ok /run/awoooi-cd-lane-ok /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open; do
[ -e "$s" ] && sentinel_left=$((sentinel_left + 1))
done
echo "RUNNER_SENTINELS_LEFT $sentinel_left"
active_job_containers=$(docker ps --format "{{.Names}}" 2>/dev/null | grep -Ec "^(GITEA-ACTIONS-|awoooi-cd-)" || true)
echo "ACTIVE_JOB_CONTAINERS $active_job_containers"
cd_lane_guard_ok=0
if [ "$enforcer_timer_active" = "active" ] \
&& [ "$enforcer_timer_enabled" = "enabled" ] \
&& [ "$enforcer_service_result" = "success" ] \
&& [ "$authority_timer_active" = "active" ] \
&& [ "$authority_timer_enabled" = "enabled" ] \
&& [ "$authority_service_result" = "success" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& [ "$cd_lane_root_restore_left" = "0" ] \
&& [ "$sentinel_left" = "0" ] \
&& [ "$active_job_containers" = "0" ]; then
if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then
cd_lane_guard_ok=1
fi
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
@@ -369,15 +423,12 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
warn "runner watchdog state not confirmed"
fi
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' <<<"$out"; then
ok "110 runner/CD lane units are fail-closed"
ok "110 legacy direct/Gitea runner units are fail-closed"
else
fail "110 runner/CD lane units are not fail-closed"
fail "110 legacy direct/Gitea runner units are not fail-closed"
fi
grep -q "RUNNER_FAILCLOSED_ENFORCER timer_active=active timer_enabled=enabled service_result=success" <<<"$out" && ok "110 fail-closed enforcer timer active and successful" || fail "110 fail-closed enforcer timer not healthy"
grep -q "RUNNER_FAILCLOSED_AUTHORITY timer_active=active timer_enabled=enabled service_result=success" <<<"$out" && ok "110 fail-closed authority timer active and successful" || fail "110 fail-closed authority timer not healthy"
grep -q "CD_LANE_GUARDRAILS_OK 1" <<<"$out" && ok "110 cd-lane/drain lane are fail-closed with enforcer" || fail "110 cd-lane/drain lane fail-closed guardrails incomplete"
grep -q "CD_LANE_GUARDRAILS_OK 1" <<<"$out" && ok "110 controlled cd-lane is safe, drained, or fail-closed" || fail "110 controlled cd-lane is neither safe-open/drained nor fail-closed"
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 legacy direct runner process count is zero" || fail "110 legacy direct runner process detected"
grep -q "ACTIVE_JOB_CONTAINERS 0" <<<"$out" && ok "110 Gitea/CD job container count is zero" || fail "110 Gitea/CD job container still active"
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out" && fail "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
}

View File

@@ -306,82 +306,137 @@ check_runner_guardrails() {
local out bad
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
bad=0
for u in awoooi-cd-lane.service awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
unit_ok=0
unit_stub=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ]; then
unit_ok=1
elif [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ] \
&& systemctl cat "$u" 2>/dev/null | grep -q "ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited"; then
unit_stub=1
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid stub=$unit_stub ok=$unit_ok"
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active ok=$unit_ok"
[ "$unit_ok" = "1" ] || bad=1
done
enforcer_timer_active=$(systemctl is-active awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_service_result=$(systemctl show awoooi-runner-failclosed-enforcer.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_ENFORCER timer_active=$enforcer_timer_active timer_enabled=$enforcer_timer_enabled service_result=$enforcer_service_result"
[ "$enforcer_timer_active" = "active" ] && [ "$enforcer_timer_enabled" = "enabled" ] && [ "$enforcer_service_result" = "success" ] || bad=1
authority_timer_active=$(systemctl is-active awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_service_result=$(systemctl show awoooi-runner-failclosed-authority.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_AUTHORITY timer_active=$authority_timer_active timer_enabled=$authority_timer_enabled service_result=$authority_service_result"
[ "$authority_timer_active" = "active" ] && [ "$authority_timer_enabled" = "enabled" ] && [ "$authority_service_result" = "success" ] || bad=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
echo "CD_LANE_PROCESS_COUNT $cd_lane_process_count"
[ "$cd_lane_process_count" = "0" ] || bad=1
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
cd_lane_sentinel=missing
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
cd_lane_capacity_ok=0
cd_lane_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_labels_ok=1
fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] \
&& [ "$cd_lane_sentinel" = "missing" ] \
&& [ "$cd_lane_binary_elf" = "0" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
cd_lane_ok=1
cd_lane_mode=failclosed
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
cd_lane_ok=1
cd_lane_mode=controlled_open
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
cd_lane_drain_limits_ok=0
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
&& [ "$cd_lane_drain_memory_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \
&& [ "$cd_lane_drain_tasks_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then
cd_lane_drain_limits_ok=1
fi
cd_lane_drain_capacity_ok=0
cd_lane_drain_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_labels_ok=1
fi
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_drain_ok=0
cd_lane_drain_mode=blocked
if [ "$cd_lane_drain_active" != "active" ] \
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
&& [ "$cd_lane_drain_process_count" = "0" ] \
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
cd_lane_drain_ok=1
cd_lane_drain_mode=failclosed
elif [ "$cd_lane_drain_active" = "active" ] \
&& [ "$cd_lane_drain_capacity_ok" = "1" ] \
&& [ "$cd_lane_drain_labels_ok" = "1" ] \
&& [ "$cd_lane_drain_binary_elf" = "1" ] \
&& [ "$cd_lane_drain_limits_ok" = "1" ]; then
cd_lane_drain_ok=1
cd_lane_drain_mode=controlled_open
fi
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
cd_lane_root_restore_left=unknown
if sudo -n true >/dev/null 2>&1; then
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-runner-restore-sources-disabled*" -o -name "awoooi-cd-lane-disabled*" -o -name "awoooi-cd-lane-drain-disabled*" \) -print 2>/dev/null | wc -l | tr -d " ")
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ")
fi
echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left"
if [ "$cd_lane_root_restore_left" = "0" ]; then
:
else
bad=1
cd_lane_guard_ok=0
if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then
cd_lane_guard_ok=1
fi
sentinel_left=0
for s in /run/awoooi-runner-host-enabled /run/awoooi-start-controlled-cd-lane /run/awoooi-start-controlled-cd-lane-drain /run/awoooi-start-cd-lane-allowed /run/awoooi-cd-lane-drain-ok /run/awoooi-cd-lane-ok /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open; do
[ -e "$s" ] && sentinel_left=$((sentinel_left + 1))
done
echo "RUNNER_SENTINELS_LEFT $sentinel_left"
[ "$sentinel_left" = "0" ] || bad=1
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
[ "$cd_lane_guard_ok" = "1" ] || bad=1
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
[ "$direct_runner_count" = "0" ] || bad=1
job_count=$(docker ps --format "{{.Names}}" 2>/dev/null | grep -Ec "^(GITEA-ACTIONS-|awoooi-cd-)" || true)
echo "ACTIVE_JOB_CONTAINERS $job_count"
[ "$job_count" = "0" ] || bad=1
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && bad=1
done
cd_lane_guard_ok=0
[ "$bad" = "0" ] && cd_lane_guard_ok=1
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
load=$(systemctl show "$u" -p LoadState --value)
unitfile=$(systemctl show "$u" -p UnitFileState --value)
mainpid=$(systemctl show "$u" -p MainPID --value)
watchdog=$(systemctl show "$u" -p WatchdogUSec --value)
quota=$(systemctl show "$u" -p CPUQuotaPerSecUSec --value)
memory=$(systemctl show "$u" -p MemoryMax --value)
state=$(systemctl show "$u" -p ActiveState --value)
unitfile=$(systemctl show "$u" -p UnitFileState --value)
echo "$u watchdog=$watchdog quota=$quota memory=$memory state=$state unitfile=$unitfile"
if [ "$state" = "active" ] || [ "$state" = "activating" ]; then
[ "$watchdog" = "0" ] || bad=1
[ "$quota" != "infinity" ] && [ "$quota" != "0" ] || bad=1
[ "$memory" != "infinity" ] && [ "$memory" != "0" ] || bad=1
elif [ "$unitfile" = "masked" ] || [ "$state" = "inactive" ]; then
:
else
bad=1
action_ok=0
action_mode=blocked
if [ "$state" != "active" ] \
&& { [ "$load" = "masked" ] || [ "$load" = "not-found" ] || [ "$unitfile" = "masked" ] || [ "$unitfile" = "disabled" ]; } \
&& [ "${mainpid:-0}" = "0" ]; then
action_ok=1
action_mode=github_disabled
fi
echo "$u mode=$action_mode load=$load unitfile=$unitfile state=$state mainpid=$mainpid watchdog=$watchdog quota=$quota memory=$memory ok=$action_ok"
[ "$action_ok" = "1" ] || bad=1
done
echo "BAD_RUNNER_GUARDRAILS $bad"
' 2>&1); then
@@ -390,7 +445,7 @@ echo "BAD_RUNNER_GUARDRAILS $bad"
return
fi
echo "$out"
grep -q "BAD_RUNNER_GUARDRAILS 0" <<<"$out" && ok "110 runner/CD lane fail-closed enforcer and guardrails complete" || blocked "110 runner/CD lane fail-closed guardrails incomplete"
grep -q "BAD_RUNNER_GUARDRAILS 0" <<<"$out" && ok "legacy runner fail-closed and controlled cd-lane guardrails complete" || blocked "legacy runner / controlled cd-lane guardrails incomplete"
}
check_job_containers() {

View File

@@ -538,61 +538,112 @@ fi
section "110 runner fail-closed guard"
runner_tmp="$(mktemp -t post-start-runner.XXXXXX)"
if ssh_read "wooo@192.168.0.110" '
for u in awoooi-cd-lane.service awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
unit_ok=0
unit_stub=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ]; then
unit_ok=1
elif [ "$active" = "inactive" ] && [ "${mainpid:-0}" = "0" ] \
&& systemctl cat "$u" 2>/dev/null | grep -q "ConditionPathExists=/run/awoooi-runner-migrated-or-hard-limited"; then
unit_stub=1
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid stub=$unit_stub ok=$unit_ok"
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
done
enforcer_timer_active=$(systemctl is-active awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-enforcer.timer 2>/dev/null || true)
enforcer_service_result=$(systemctl show awoooi-runner-failclosed-enforcer.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_ENFORCER timer_active=$enforcer_timer_active timer_enabled=$enforcer_timer_enabled service_result=$enforcer_service_result"
authority_timer_active=$(systemctl is-active awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_timer_enabled=$(systemctl is-enabled awoooi-runner-failclosed-authority.timer 2>/dev/null || true)
authority_service_result=$(systemctl show awoooi-runner-failclosed-authority.service -p Result --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_AUTHORITY timer_active=$authority_timer_active timer_enabled=$authority_timer_enabled service_result=$authority_service_result"
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
echo "CD_LANE_PROCESS_COUNT $cd_lane_process_count"
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
cd_lane_mainpid=$(systemctl show awoooi-cd-lane.service -p MainPID --value 2>/dev/null || true)
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
cd_lane_sentinel=missing
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
cd_lane_capacity_ok=0
cd_lane_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
cd_lane_labels_ok=1
fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] \
&& [ "$cd_lane_sentinel" = "missing" ] \
&& [ "$cd_lane_binary_elf" = "0" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
cd_lane_ok=1
cd_lane_mode=failclosed
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true)
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
cd_lane_drain_limits_ok=0
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
&& [ "$cd_lane_drain_memory_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \
&& [ "$cd_lane_drain_tasks_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then
cd_lane_drain_limits_ok=1
fi
cd_lane_drain_capacity_ok=0
cd_lane_drain_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
&& ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
cd_lane_drain_labels_ok=1
fi
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_drain_ok=0
cd_lane_drain_mode=blocked
if [ "$cd_lane_drain_active" != "active" ] \
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
&& [ "$cd_lane_drain_process_count" = "0" ] \
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
cd_lane_drain_ok=1
cd_lane_drain_mode=failclosed
elif [ "$cd_lane_drain_active" = "active" ] \
&& [ "$cd_lane_drain_capacity_ok" = "1" ] \
&& [ "$cd_lane_drain_labels_ok" = "1" ] \
&& [ "$cd_lane_drain_binary_elf" = "1" ] \
&& [ "$cd_lane_drain_limits_ok" = "1" ]; then
cd_lane_drain_ok=1
cd_lane_drain_mode=controlled_open
fi
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
cd_lane_root_restore_left=unknown
if sudo -n true >/dev/null 2>&1; then
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-runner-restore-sources-disabled*" -o -name "awoooi-cd-lane-disabled*" -o -name "awoooi-cd-lane-drain-disabled*" \) -print 2>/dev/null | wc -l | tr -d " ")
cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ")
fi
echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left"
sentinel_left=0
for s in /run/awoooi-runner-host-enabled /run/awoooi-start-controlled-cd-lane /run/awoooi-start-controlled-cd-lane-drain /run/awoooi-start-cd-lane-allowed /run/awoooi-cd-lane-drain-ok /run/awoooi-cd-lane-ok /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open; do
[ -e "$s" ] && sentinel_left=$((sentinel_left + 1))
done
echo "RUNNER_SENTINELS_LEFT $sentinel_left"
active_job_containers=$(docker ps --format "{{.Names}}" 2>/dev/null | grep -Ec "^(GITEA-ACTIONS-|awoooi-cd-)" || true)
echo "ACTIVE_JOB_CONTAINERS $active_job_containers"
cd_lane_guard_ok=0
if [ "$enforcer_timer_active" = "active" ] \
&& [ "$enforcer_timer_enabled" = "enabled" ] \
&& [ "$enforcer_service_result" = "success" ] \
&& [ "$authority_timer_active" = "active" ] \
&& [ "$authority_timer_enabled" = "enabled" ] \
&& [ "$authority_service_result" = "success" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& [ "$cd_lane_root_restore_left" = "0" ] \
&& [ "$sentinel_left" = "0" ] \
&& [ "$active_job_containers" = "0" ]; then
if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then
cd_lane_guard_ok=1
fi
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
@@ -606,15 +657,12 @@ else
fi
cat "$runner_tmp"
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' "$runner_tmp"; then
ok "110 runner/CD lane units are fail-closed"
ok "110 legacy direct/Gitea runner units are fail-closed"
else
blocked "110 runner/CD lane units are not fail-closed"
blocked "110 legacy direct/Gitea runner units are not fail-closed"
fi
grep -q "RUNNER_FAILCLOSED_ENFORCER timer_active=active timer_enabled=enabled service_result=success" "$runner_tmp" && ok "110 fail-closed enforcer timer active and successful" || blocked "110 fail-closed enforcer timer not healthy"
grep -q "RUNNER_FAILCLOSED_AUTHORITY timer_active=active timer_enabled=enabled service_result=success" "$runner_tmp" && ok "110 fail-closed authority timer active and successful" || blocked "110 fail-closed authority timer not healthy"
grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 cd-lane/drain lane are fail-closed with enforcer" || blocked "110 cd-lane/drain lane fail-closed guardrails incomplete"
grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || blocked "110 controlled cd-lane guardrails incomplete"
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || blocked "110 legacy direct runner process detected"
grep -q "ACTIVE_JOB_CONTAINERS 0" "$runner_tmp" && ok "110 Gitea/CD job container count is zero" || blocked "110 Gitea/CD job container still active"
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking"
rm -f "$runner_tmp"