From e45e52e526e9fd97a1e2ff3051c897578545a1a3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 31 May 2026 13:08:39 +0800 Subject: [PATCH] fix(awooop): cooldown ansible check-mode transport blockers --- apps/api/src/core/config.py | 9 ++++ .../awooop_ansible_check_mode_service.py | 51 +++++++++++++++++++ .../services/awooop_truth_chain_service.py | 34 ++++++++++++- .../tests/test_awooop_truth_chain_service.py | 48 +++++++++++++++++ k8s/awoooi-prod/06-deployment-api.yaml | 2 + 5 files changed, 143 insertions(+), 1 deletion(-) diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index 65628563..50345ef9 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -639,6 +639,15 @@ class Settings(BaseSettings): le=900, description="Delay before the check-mode worker first tick after API startup.", ) + AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS: int = Field( + default=21_600, + ge=300, + le=86_400, + description=( + "Cooldown after transport-level check-mode blockers such as " + "forced-command repair SSH denial." 
+ ), + ) # ========================================================================== # 統帥鐵律:禁止 SQLite (AWOOOI 憲法) diff --git a/apps/api/src/services/awooop_ansible_check_mode_service.py b/apps/api/src/services/awooop_ansible_check_mode_service.py index 0ac0b772..a2266c3b 100644 --- a/apps/api/src/services/awooop_ansible_check_mode_service.py +++ b/apps/api/src/services/awooop_ansible_check_mode_service.py @@ -31,6 +31,7 @@ _SAFE_HOST_RE = re.compile(r"^[A-Za-z0-9_.-]+$") _PLAYBOOK_PREFIX = Path("infra/ansible/playbooks") _STDOUT_LIMIT = 20_000 _STDERR_LIMIT = 12_000 +FORCED_COMMAND_BLOCKER = "ansible_repair_ssh_forced_command_denies_ansible_bootstrap" @dataclass(frozen=True) @@ -80,6 +81,14 @@ def _json_loads(value: Any) -> dict[str, Any]: return {} +def detect_ansible_transport_blockers(*values: Any) -> list[str]: + combined = " ".join(str(value or "") for value in values) + blockers: list[str] = [] + if "REPAIR_DENIED:invalid_command" in combined: + blockers.append(FORCED_COMMAND_BLOCKER) + return blockers + + def _playbook_roots(module_path: Path | None = None) -> list[Path]: resolved_module_path = (module_path or Path(__file__)).resolve() return [ @@ -387,6 +396,44 @@ async def claim_pending_check_modes( return claims +async def recent_ansible_transport_blockers( + *, + project_id: str = "awoooi", + cooldown_seconds: int | None = None, +) -> list[str]: + """Return transport blockers observed from recent failed check-mode rows.""" + + cooldown = cooldown_seconds or settings.AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS + async with get_db_context(project_id) as db: + result = await db.execute( + text(""" + SELECT + coalesce(output::text, '') AS output_text, + coalesce(dry_run_result::text, '') AS dry_run_text, + coalesce(error, '') AS error_text, + coalesce(stderr_feed_back, '') AS stderr_text + FROM automation_operation_log + WHERE operation_type = 'ansible_check_mode_executed' + AND status = 'failed' + AND created_at >= NOW() - CAST(:cooldown AS 
interval) + ORDER BY created_at DESC + LIMIT 20 + """), + {"cooldown": f"{max(60, cooldown)} seconds"}, + ) + blockers: set[str] = set() + for row in result.mappings().all(): + blockers.update( + detect_ansible_transport_blockers( + row.get("output_text"), + row.get("dry_run_text"), + row.get("error_text"), + row.get("stderr_text"), + ) + ) + return sorted(blockers) + + async def _insert_skipped_candidate( db: Any, *, @@ -512,6 +559,10 @@ async def run_pending_check_modes_once( if blockers: logger.warning("ansible_check_mode_runtime_blocked", blockers=blockers) return {"claimed": 0, "completed": 0, "failed": 0, "blockers": blockers} + transport_blockers = await recent_ansible_transport_blockers(project_id=project_id) + if transport_blockers: + logger.warning("ansible_check_mode_transport_blocked", blockers=transport_blockers) + return {"claimed": 0, "completed": 0, "failed": 0, "blockers": transport_blockers} claims = await claim_pending_check_modes(project_id=project_id, limit=limit) completed = 0 diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index 33144c4d..2940832f 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -21,6 +21,7 @@ import structlog from sqlalchemy import text from src.db.base import get_db_context +from src.services.awooop_ansible_check_mode_service import detect_ansible_transport_blockers from src.services.awooop_ansible_audit_service import build_ansible_truth from src.services.drift_repeat_state import build_drift_repeat_state @@ -722,6 +723,28 @@ def _execution_backend_summary(records: list[dict[str, Any]]) -> dict[str, Any]: return summary +def _ansible_observed_runtime_blockers(records: list[dict[str, Any]]) -> list[str]: + blockers: set[str] = set() + for record in records: + execution = record.get("execution") if isinstance(record.get("execution"), dict) else {} + ansible = 
execution.get("ansible") if isinstance(execution.get("ansible"), dict) else {} + ansible_records = ansible.get("records") if isinstance(ansible.get("records"), list) else [] + for row in ansible_records: + if not isinstance(row, dict): + continue + if str(row.get("operation_type") or "") != "ansible_check_mode_executed": + continue + dry_run_result = row.get("dry_run_result") if isinstance(row.get("dry_run_result"), dict) else {} + blockers.update( + detect_ansible_transport_blockers( + row.get("error"), + dry_run_result.get("stdout_tail"), + dry_run_result.get("stderr_tail"), + ) + ) + return sorted(blockers) + + def _ansible_playbook_roots(module_path: Path | None = None) -> list[Path]: resolved_module_path = (module_path or Path(__file__)).resolve() return [ @@ -875,6 +898,15 @@ def summarize_automation_quality_records( key=lambda row: (-int(row["total"]), str(row["gate"])), ) + ansible_runtime = _ansible_runtime_readiness() + observed_ansible_blockers = _ansible_observed_runtime_blockers(records) + if observed_ansible_blockers: + ansible_runtime["observed_transport_blockers"] = observed_ansible_blockers + ansible_runtime["blockers"] = sorted( + set(ansible_runtime.get("blockers") or []) | set(observed_ansible_blockers) + ) + ansible_runtime["can_run_check_mode"] = False + return { "schema_version": "automation_quality_summary_v1", "project_id": project_id, @@ -888,7 +920,7 @@ def summarize_automation_quality_records( "by_verdict": by_verdict, "gate_failures": failing_gates, "execution_backend_summary": _execution_backend_summary(records), - "ansible_runtime": _ansible_runtime_readiness(), + "ansible_runtime": ansible_runtime, "examples": examples[:25], "production_claim": { "can_claim_full_auto_repair": evaluated_total > 0 and verified_total == evaluated_total, diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index 0d2d59c9..c4a39599 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py 
def test_ansible_transport_blocker_detects_repair_forced_command_denial() -> None:
    """The forced-command denial marker maps to the forced-command blocker code."""
    ansible_output = "fatal: host unreachable REPAIR_DENIED:invalid_command"

    assert detect_ansible_transport_blockers(ansible_output) == [
        "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"
    ]


def test_quality_summary_marks_forced_command_denial_as_runtime_blocker() -> None:
    """A failed check-mode row carrying the denial marker blocks check mode in the summary."""
    # Failed check-mode row whose stdout tail carries the denial marker.
    denied_check_mode_row = {
        "op_id": "check-1",
        "operation_type": "ansible_check_mode_executed",
        "status": "failed",
        "dry_run_result": {
            "stdout_tail": "REPAIR_DENIED:invalid_command",
        },
    }
    incident_record = {
        "incident": {"incident_id": "INC-1", "alertname": "DockerContainerUnhealthy"},
        "truth_status": {},
        "automation_quality": {"applicable": True, "score": 50, "verdict": "observed"},
        "execution": {
            "automation_operation_log": [],
            "auto_repair_executions": [],
            "ansible": {
                "considered": True,
                "candidate_catalog": {"candidates": [{"catalog_id": "ansible:110-devops"}]},
                "records": [denied_check_mode_row],
            },
        },
    }

    summary = summarize_automation_quality_records(
        project_id="awoooi",
        window_hours=24,
        limit=20,
        records=[incident_record],
    )

    ansible_runtime = summary["ansible_runtime"]
    assert ansible_runtime["can_run_check_mode"] is False
    assert (
        "ansible_repair_ssh_forced_command_denies_ansible_bootstrap"
        in ansible_runtime["blockers"]
    )