fix(awooop): cooldown ansible check-mode transport blockers

2026-05-31 13:08:39 +08:00
parent 46cc56c3ce
commit e45e52e526
5 changed files with 143 additions and 1 deletions
--- a/apps/api/src/core/config.py
+++ b/apps/api/src/core/config.py
@@ -639,6 +639,15 @@ class Settings(BaseSettings):
        le=900,
        description="Delay before the check-mode worker first tick after API startup.",
    )
+    AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS: int = Field(  # cooldown window, expressed in seconds
+        default=21_600,  # 6 hours
+        ge=300,  # lower bound: 5 minutes
+        le=86_400,  # upper bound: 24 hours
+        description=(
+            "Cooldown after transport-level check-mode blockers such as "
+            "forced-command repair SSH denial."
+        ),
+    )

    # ==========================================================================
    # 統帥鐵律：禁止 SQLite (AWOOOI 憲法)
--- a/apps/api/src/services/awooop_ansible_check_mode_service.py
+++ b/apps/api/src/services/awooop_ansible_check_mode_service.py
@@ -31,6 +31,7 @@ _SAFE_HOST_RE = re.compile(r"^[A-Za-z0-9_.-]+$")
 _PLAYBOOK_PREFIX = Path("infra/ansible/playbooks")
 _STDOUT_LIMIT = 20_000
 _STDERR_LIMIT = 12_000
+FORCED_COMMAND_BLOCKER = "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"  # blocker code returned by detect_ansible_transport_blockers


@dataclass(frozen=True)
@@ -80,6 +81,14 @@ def _json_loads(value: Any) -> dict[str, Any]:
    return {}


+def detect_ansible_transport_blockers(*values: Any) -> list[str]:  # Map raw output/error text fragments to known transport blocker codes.
+    combined = " ".join(str(value or "") for value in values)  # falsy values (None, "", 0) contribute an empty string
+    blockers: list[str] = []
+    if "REPAIR_DENIED:invalid_command" in combined:  # plain substring match over all inputs joined by spaces
+        blockers.append(FORCED_COMMAND_BLOCKER)
+    return blockers  # empty list when the marker is absent
+
+
 def _playbook_roots(module_path: Path | None = None) -> list[Path]:
    resolved_module_path = (module_path or Path(__file__)).resolve()
    return [
@@ -387,6 +396,44 @@ async def claim_pending_check_modes(
    return claims


+async def recent_ansible_transport_blockers(
+    *,
+    project_id: str = "awoooi",
+    cooldown_seconds: int | None = None,
+) -> list[str]:
+    """Return sorted transport blocker codes found in failed check-mode rows inside the cooldown window."""
+
+    cooldown = cooldown_seconds or settings.AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS  # NOTE(review): `or` makes an explicit 0 fall back to the setting, same as None
+    async with get_db_context(project_id) as db:
+        result = await db.execute(
+            text("""
+                SELECT
+                    coalesce(output::text, '') AS output_text,
+                    coalesce(dry_run_result::text, '') AS dry_run_text,
+                    coalesce(error, '') AS error_text,
+                    coalesce(stderr_feed_back, '') AS stderr_text
+                FROM automation_operation_log
+                WHERE operation_type = 'ansible_check_mode_executed'
+                  AND status = 'failed'
+                  AND created_at >= NOW() - CAST(:cooldown AS interval)
+                ORDER BY created_at DESC
+                LIMIT 20
+            """),
+            {"cooldown": f"{max(60, cooldown)} seconds"},  # window is floored at 60 seconds; bound as text and cast to interval in the query
+        )
+        blockers: set[str] = set()  # set de-duplicates codes seen in several rows
+        for row in result.mappings().all():  # at most the 20 newest failed rows (LIMIT 20 above)
+            blockers.update(
+                detect_ansible_transport_blockers(
+                    row.get("output_text"),
+                    row.get("dry_run_text"),
+                    row.get("error_text"),
+                    row.get("stderr_text"),
+                )
+            )
+    return sorted(blockers)  # sorted so the result order is deterministic
+
+
 async def _insert_skipped_candidate(
    db: Any,
    *,
@@ -512,6 +559,10 @@ async def run_pending_check_modes_once(
    if blockers:
        logger.warning("ansible_check_mode_runtime_blocked", blockers=blockers)
        return {"claimed": 0, "completed": 0, "failed": 0, "blockers": blockers}
+    transport_blockers = await recent_ansible_transport_blockers(project_id=project_id)
+    if transport_blockers:
+        logger.warning("ansible_check_mode_transport_blocked", blockers=transport_blockers)
+        return {"claimed": 0, "completed": 0, "failed": 0, "blockers": transport_blockers}

    claims = await claim_pending_check_modes(project_id=project_id, limit=limit)
    completed = 0
--- a/apps/api/src/services/awooop_truth_chain_service.py
+++ b/apps/api/src/services/awooop_truth_chain_service.py
@@ -21,6 +21,7 @@ import structlog
 from sqlalchemy import text

 from src.db.base import get_db_context
+from src.services.awooop_ansible_check_mode_service import detect_ansible_transport_blockers
 from src.services.awooop_ansible_audit_service import build_ansible_truth
 from src.services.drift_repeat_state import build_drift_repeat_state

@@ -722,6 +723,28 @@ def _execution_backend_summary(records: list[dict[str, Any]]) -> dict[str, Any]:
    return summary


+def _ansible_observed_runtime_blockers(records: list[dict[str, Any]]) -> list[str]:  # Collect transport blocker codes from check-mode rows nested under record["execution"]["ansible"]["records"].
+    blockers: set[str] = set()  # set de-duplicates codes seen across records
+    for record in records:
+        execution = record.get("execution") if isinstance(record.get("execution"), dict) else {}  # non-dict values are treated as empty
+        ansible = execution.get("ansible") if isinstance(execution.get("ansible"), dict) else {}
+        ansible_records = ansible.get("records") if isinstance(ansible.get("records"), list) else []
+        for row in ansible_records:
+            if not isinstance(row, dict):  # skip malformed entries instead of raising
+                continue
+            if str(row.get("operation_type") or "") != "ansible_check_mode_executed":  # only check-mode execution rows are inspected
+                continue
+            dry_run_result = row.get("dry_run_result") if isinstance(row.get("dry_run_result"), dict) else {}
+            blockers.update(
+                detect_ansible_transport_blockers(
+                    row.get("error"),
+                    dry_run_result.get("stdout_tail"),
+                    dry_run_result.get("stderr_tail"),
+                )
+            )
+    return sorted(blockers)  # sorted so the result order is deterministic
+
+
 def _ansible_playbook_roots(module_path: Path | None = None) -> list[Path]:
    resolved_module_path = (module_path or Path(__file__)).resolve()
    return [
@@ -875,6 +898,15 @@ def summarize_automation_quality_records(
        key=lambda row: (-int(row["total"]), str(row["gate"])),
    )

+    ansible_runtime = _ansible_runtime_readiness()
+    observed_ansible_blockers = _ansible_observed_runtime_blockers(records)
+    if observed_ansible_blockers:
+        ansible_runtime["observed_transport_blockers"] = observed_ansible_blockers
+        ansible_runtime["blockers"] = sorted(
+            set(ansible_runtime.get("blockers") or []) | set(observed_ansible_blockers)
+        )
+        ansible_runtime["can_run_check_mode"] = False
+
    return {
        "schema_version": "automation_quality_summary_v1",
        "project_id": project_id,
@@ -888,7 +920,7 @@ def summarize_automation_quality_records(
        "by_verdict": by_verdict,
        "gate_failures": failing_gates,
        "execution_backend_summary": _execution_backend_summary(records),
-        "ansible_runtime": _ansible_runtime_readiness(),
+        "ansible_runtime": ansible_runtime,
        "examples": examples[:25],
        "production_claim": {
            "can_claim_full_auto_repair": evaluated_total > 0 and verified_total == evaluated_total,
--- a/apps/api/tests/test_awooop_truth_chain_service.py
+++ b/apps/api/tests/test_awooop_truth_chain_service.py
@@ -12,6 +12,7 @@ from src.services.awooop_ansible_audit_service import (
 from src.services.awooop_ansible_check_mode_service import (
    build_ansible_check_mode_claim_input,
    build_ansible_check_mode_command,
+    detect_ansible_transport_blockers,
 )
 from src.services.awooop_truth_chain_service import (
    _ansible_playbook_roots,
@@ -976,6 +977,14 @@ def test_ansible_check_mode_command_uses_check_diff_and_repair_ssh(tmp_path: Pat
    assert "apply" not in " ".join(spec.command)


+def test_ansible_transport_blocker_detects_repair_forced_command_denial() -> None:  # The denial marker embedded in longer output must map to the forced-command blocker code.
+    blockers = detect_ansible_transport_blockers(
+        "fatal: host unreachable REPAIR_DENIED:invalid_command",  # marker appears after unrelated text
+    )
+
+    assert blockers == ["ansible_repair_ssh_forced_command_denies_ansible_bootstrap"]  # exactly one blocker code is expected
+
+
 def test_execution_backend_summary_subtracts_completed_check_mode_parent() -> None:
    summary = _execution_backend_summary([
        {
@@ -1006,3 +1015,42 @@ def test_execution_backend_summary_subtracts_completed_check_mode_parent() -> No

    assert summary["ansible_check_mode_total"] == 1
    assert summary["ansible_pending_check_mode_total"] == 0
+
+
+def test_quality_summary_marks_forced_command_denial_as_runtime_blocker() -> None:  # A failed check-mode row carrying the denial marker must surface in the summary's ansible_runtime section.
+    summary = summarize_automation_quality_records(
+        project_id="awoooi",
+        window_hours=24,
+        limit=20,
+        records=[
+            {
+                "incident": {"incident_id": "INC-1", "alertname": "DockerContainerUnhealthy"},
+                "truth_status": {},
+                "automation_quality": {"applicable": True, "score": 50, "verdict": "observed"},
+                "execution": {
+                    "automation_operation_log": [],
+                    "auto_repair_executions": [],
+                    "ansible": {
+                        "considered": True,
+                        "candidate_catalog": {"candidates": [{"catalog_id": "ansible:110-devops"}]},
+                        "records": [
+                            {
+                                "op_id": "check-1",
+                                "operation_type": "ansible_check_mode_executed",  # the only operation type the blocker scan inspects
+                                "status": "failed",
+                                "dry_run_result": {
+                                    "stdout_tail": "REPAIR_DENIED:invalid_command",  # denial marker placed in the stdout tail
+                                },
+                            }
+                        ],
+                    },
+                },
+            }
+        ],
+    )
+
+    assert summary["ansible_runtime"]["can_run_check_mode"] is False  # an observed blocker forces check-mode readiness off
+    assert (
+        "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"
+        in summary["ansible_runtime"]["blockers"]
+    )
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -111,6 +111,8 @@ spec:
              value: "180"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS
              value: "120"
+            - name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS
+              value: "21600"
          # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
          volumeMounts:
            - name: repair-ssh-key