diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py
index 08aec070..65628563 100644
--- a/apps/api/src/core/config.py
+++ b/apps/api/src/core/config.py
@@ -609,6 +609,36 @@ class Settings(BaseSettings):
             "(X-AwoooP-Operator-Key header)"
         ),
     )
+    ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER: bool = Field(  # opt-in; the worker loop returns at once when False
+        default=False,
+        description=(
+            "True=consume ansible_candidate_matched AOL rows and run "
+            "ansible-playbook --check --diff only. Apply remains disabled."
+        ),
+    )
+    AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS: int = Field(
+        default=300,
+        ge=60,  # lower bound only: the loop never polls more often than once a minute
+        description="AwoooP Ansible check-mode worker polling interval.",
+    )
+    AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT: int = Field(
+        default=1,
+        ge=1,
+        le=5,  # at most five candidates are claimed per tick
+        description="Maximum Ansible check-mode candidates claimed per worker tick.",
+    )
+    AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS: int = Field(
+        default=180,
+        ge=30,
+        le=600,  # hard cap of ten minutes per ansible-playbook run
+        description="Timeout for one ansible-playbook --check --diff execution.",
+    )
+    AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS: int = Field(
+        default=120,
+        ge=0,  # 0 lets the first tick run right after startup
+        le=900,
+        description="Delay before the check-mode worker first tick after API startup.",
+    )
 
     # ==========================================================================
     # 統帥鐵律:禁止 SQLite (AWOOOI 憲法)
diff --git a/apps/api/src/jobs/awooop_ansible_check_mode_job.py b/apps/api/src/jobs/awooop_ansible_check_mode_job.py
new file mode 100644
index 00000000..6a580698
--- /dev/null
+++ b/apps/api/src/jobs/awooop_ansible_check_mode_job.py
@@ -0,0 +1,50 @@
+"""AwoooP Ansible check-mode worker loop.
+
+Runs only when explicitly enabled by settings. The worker consumes pending
+``ansible_candidate_matched`` rows and records check-mode evidence; it never
+executes Ansible apply.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+import structlog
+
+from src.core.config import settings
+from src.services.awooop_ansible_check_mode_service import run_pending_check_modes_once
+
+logger = structlog.get_logger(__name__)
+
+
+async def run_awooop_ansible_check_mode_loop() -> None:
+    """Poll for pending Ansible check-mode candidates, one batch per interval.
+
+    Returns immediately when the worker is disabled in settings. Otherwise it
+    waits the configured startup delay and then ticks forever; a failing tick
+    is logged with its traceback and the loop continues with the next tick.
+    """
+    if not settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER:
+        logger.info("awooop_ansible_check_mode_worker_disabled")
+        return
+
+    logger.info(
+        "awooop_ansible_check_mode_worker_started",
+        interval_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS,
+        batch_limit=settings.AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT,
+        timeout_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS,
+    )
+    await asyncio.sleep(settings.AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS)
+
+    while True:
+        try:
+            result = await run_pending_check_modes_once(
+                limit=settings.AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT,
+                timeout_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS,
+            )
+            if result.get("claimed") or result.get("blockers"):  # stay quiet on idle ticks
+                logger.info("awooop_ansible_check_mode_worker_tick", **result)
+        except Exception as exc:  # keep the loop alive; exc_info preserves the traceback str(exc) loses
+            logger.warning("awooop_ansible_check_mode_worker_failed", error=str(exc), exc_info=True)
+
+        await asyncio.sleep(settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS)
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index ca562d58..1044d071 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -521,6 +521,22 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
     except Exception as e:
         logger.warning("incident_lifecycle_reconciler_schedule_failed", error=str(e))
 
+    # AwoooP Ansible check-mode worker.
+    # Runs only ansible-playbook --check --diff and writes the result back to automation_operation_log;
+    # apply must still go through the approval gate, and this worker never writes auto_repair_executions.
+    try:  # the task is stored on app.state because the event loop only keeps weak references to tasks
+        from src.jobs.awooop_ansible_check_mode_job import (
+            run_awooop_ansible_check_mode_loop,
+        )
+        _app.state.awooop_ansible_check_mode_task = asyncio.create_task(run_awooop_ansible_check_mode_loop())
+        logger.info(
+            "awooop_ansible_check_mode_worker_scheduled",
+            enabled=settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER,
+            interval_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS,
+        )
+    except Exception as e:  # scheduling problems must not abort API startup
+        logger.warning("awooop_ansible_check_mode_worker_schedule_failed", error=str(e))
+
     # ADR-083 Phase 3: Evolver Agent(每日)— Playbook 自動合併 + 低信任封存
     # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
     try:
diff --git a/apps/api/src/services/awooop_ansible_audit_service.py b/apps/api/src/services/awooop_ansible_audit_service.py
index 40e629da..a73d66ff 100644
--- a/apps/api/src/services/awooop_ansible_audit_service.py
+++ b/apps/api/src/services/awooop_ansible_audit_service.py
@@ -105,6 +105,15 @@ _CATALOG: tuple[dict[str, Any], ...]
= (
 )
 
 
+def get_ansible_catalog_item(catalog_id: str) -> dict[str, Any] | None:
+    """Return one repo-known Ansible catalog item without exposing mutability."""
+
+    for item in _CATALOG:
+        if item["catalog_id"] == catalog_id:
+            return dict(item)  # NOTE(review): shallow copy; nested values stay shared with _CATALOG
+    return None
+
+
 def _get(row: dict[str, Any], key: str) -> Any:
     return row.get(key)
 
@@ -156,6 +165,7 @@ def _is_ansible_operation(row: dict[str, Any]) -> bool:
 def _ansible_record(row: dict[str, Any]) -> dict[str, Any]:
     return {
         "op_id": _get(row, "op_id"),
+        "parent_op_id": _get(row, "parent_op_id"),  # links check-mode/skipped rows to their candidate row
         "operation_type": _get(row, "operation_type"),
         "status": _get(row, "status"),
         "actor": _get(row, "actor"),
@@ -331,6 +341,9 @@ def build_ansible_decision_audit_payload(
                 "catalog_id": row["catalog_id"],
                 "playbook_path": row["playbook_path"],
                 "inventory_hosts": row["inventory_hosts"],
+                "supports_check_mode": row["supports_check_mode"],  # safety flags exposed on each executor candidate
+                "auto_apply_enabled": row["auto_apply_enabled"],
+                "approval_required": row["approval_required"],
                 "risk_level": row["risk_level"],
                 "match_score": row["match_score"],
                 "matched_keywords": row["matched_keywords"],
diff --git a/apps/api/src/services/awooop_ansible_check_mode_service.py b/apps/api/src/services/awooop_ansible_check_mode_service.py
new file mode 100644
index 00000000..0ac0b772
--- /dev/null
+++ b/apps/api/src/services/awooop_ansible_check_mode_service.py
@@ -0,0 +1,533 @@
+"""Safe Ansible check-mode executor for AwoooP truth-chain evidence.
+
+This service is deliberately dry-run only. It claims pending
+``ansible_candidate_matched`` AOL rows, runs ``ansible-playbook --check --diff``,
+and writes the result back as ``ansible_check_mode_executed``. It never enables
+apply and never writes auto_repair_executions.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+import shutil
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import structlog
+from sqlalchemy import text
+
+from src.core.config import settings
+from src.db.base import get_db_context
+from src.services.awooop_ansible_audit_service import get_ansible_catalog_item
+
+logger = structlog.get_logger(__name__)
+
+_SAFE_HOST_RE = re.compile(r"^[A-Za-z0-9_.-]+$")  # inventory names: letters, digits, "_", "." and "-" only
+_PLAYBOOK_PREFIX = Path("infra/ansible/playbooks")  # repo-relative directory every playbook must live under
+_STDOUT_LIMIT = 20_000  # max stdout characters stored per run (tail is kept)
+_STDERR_LIMIT = 12_000  # max stderr characters stored per run (tail is kept)
+
+
+@dataclass(frozen=True)
+class AnsibleCheckModeClaim:  # one pending check-mode row claimed for a candidate
+    op_id: str  # op_id of the inserted pending check-mode row
+    source_candidate_op_id: str  # op_id of the ansible_candidate_matched row it was claimed from
+    incident_id: str
+    catalog_id: str
+    playbook_path: str
+    inventory_hosts: tuple[str, ...]
+    input_payload: dict[str, Any]
+
+
+@dataclass(frozen=True)
+class AnsibleCommandSpec:  # fully resolved argv, working directory and environment for one run
+    command: list[str]
+    cwd: Path
+    env: dict[str, str]
+    playbook_abs_path: Path
+    inventory_abs_path: Path
+
+
+@dataclass(frozen=True)
+class AnsibleRunResult:  # outcome of one ansible-playbook invocation
+    returncode: int
+    stdout: str
+    stderr: str
+    duration_ms: int
+    timed_out: bool = False
+
+
+def _tail(text_value: str, limit: int) -> str:  # keep at most the last `limit` characters
+    if len(text_value) <= limit:
+        return text_value
+    return text_value[-limit:] if limit > 0 else ""  # text[-0:] would return the whole string
+
+
+def _json_loads(value: Any) -> dict[str, Any]:  # tolerant loader: dict passthrough, JSON object text, else {}
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+        except json.JSONDecodeError:
+            return {}
+        return parsed if isinstance(parsed, dict) else {}
+    return {}
+
+
+def _playbook_roots(module_path: Path | None = None) -> list[Path]:  # candidate roots in priority order
+    resolved_module_path = (module_path or Path(__file__)).resolve()
+    return [
+        Path("/app/infra/ansible"),
+        Path.cwd() / "infra" / "ansible",
+        *(parent / "infra" / "ansible" for parent in resolved_module_path.parents),
+    ]
+
+
+def _runtime_blockers(
+    *,
+    playbook_root: Path | None = None,
+    repair_ssh_key_path: Path = Path("/etc/repair-ssh/id_ed25519"),
repair_known_hosts_path: Path = Path("/etc/repair-known-hosts/known_hosts"),
+) -> list[str]:  # blocker codes; an empty list means every runtime prerequisite was found
+    root = playbook_root or next((path for path in _playbook_roots() if path.exists()), None)
+    blockers: list[str] = []
+    if shutil.which("ansible-playbook") is None:
+        blockers.append("ansible_playbook_binary_missing")
+    if root is None:
+        blockers.append("ansible_playbook_catalog_missing")
+    elif not (root / "inventory" / "hosts.yml").exists():  # inventory is only checked once a root exists
+        blockers.append("ansible_inventory_missing")
+    if not repair_ssh_key_path.is_file() or not os.access(repair_ssh_key_path, os.R_OK):
+        blockers.append("ansible_repair_ssh_key_missing")
+    if not repair_known_hosts_path.is_file() or not os.access(repair_known_hosts_path, os.R_OK):
+        blockers.append("ansible_repair_known_hosts_missing")
+    return blockers
+
+
+def _safe_candidate(input_payload: dict[str, Any]) -> dict[str, Any]:  # first candidate passing every gate
+    candidates = input_payload.get("executor_candidates")
+    if not isinstance(candidates, list) or not candidates:
+        raise ValueError("missing_executor_candidates")
+
+    for candidate in candidates:
+        if not isinstance(candidate, dict):
+            continue
+        catalog_id = str(candidate.get("catalog_id") or "")
+        catalog_item = get_ansible_catalog_item(catalog_id)
+        if not catalog_item:  # unknown catalog ids are never executed
+            continue
+        if catalog_item.get("supports_check_mode") is not True:
+            continue
+        if catalog_item.get("auto_apply_enabled") is True:  # items that allow auto-apply are excluded
+            continue
+        playbook_path = str(candidate.get("playbook_path") or catalog_item.get("playbook_path") or "")
+        if playbook_path != str(catalog_item.get("playbook_path") or ""):  # candidate may not override the catalog path
+            continue
+        inventory_hosts = candidate.get("inventory_hosts") or catalog_item.get("inventory_hosts") or []
+        if (  # NOTE(review): candidate hosts are regex-checked but not compared with the catalog's hosts
+            isinstance(inventory_hosts, list)
+            and inventory_hosts
+            and all(isinstance(host, str) and _SAFE_HOST_RE.fullmatch(host) for host in inventory_hosts)
+        ):
+            return {
+                "catalog_id": catalog_id,
+                "playbook_path": playbook_path,
+                "inventory_hosts": tuple(inventory_hosts),
+                "risk_level": str(candidate.get("risk_level") or
catalog_item.get("risk_level") or ""),
+            }
+    raise ValueError("no_safe_check_mode_candidate")  # no candidate survived the gates above
+
+
+def build_ansible_check_mode_claim_input(
+    *,
+    source_candidate_op_id: str,
+    candidate_input: dict[str, Any],
+) -> dict[str, Any]:  # raises ValueError (from _safe_candidate) when nothing is safe to run
+    safe = _safe_candidate(candidate_input)
+    incident_id = str(candidate_input.get("incident_id") or "")
+    return {
+        "incident_id": incident_id,
+        "executor": "ansible",
+        "execution_backend": "ansible",
+        "execution_mode": "check_mode",
+        "check_mode": True,
+        "diff": True,
+        "apply_enabled": False,  # hard-coded: a claim never authorizes apply
+        "approval_required_before_apply": True,
+        "source_candidate_op_id": source_candidate_op_id,
+        "catalog_id": safe["catalog_id"],
+        "playbook_path": safe["playbook_path"],
+        "inventory_hosts": list(safe["inventory_hosts"]),  # back to a list so the payload is JSON-serializable
+        "risk_level": safe["risk_level"],
+    }
+
+
+def _resolve_playbook_path(playbook_root: Path, playbook_path: str) -> Path:  # raises ValueError on any unsafe path
+    relative = Path(playbook_path)
+    if relative.is_absolute() or not str(relative).startswith(str(_PLAYBOOK_PREFIX)):
+        raise ValueError("unsafe_playbook_path")
+    repo_root = playbook_root.parent.parent  # assumes playbook_root is <repo>/infra/ansible
+    resolved = (repo_root / relative).resolve()
+    allowed_root = (repo_root / _PLAYBOOK_PREFIX).resolve()
+    if allowed_root not in resolved.parents:  # catches ".." segments that survive the prefix check
+        raise ValueError("playbook_outside_catalog")
+    if resolved.suffix not in {".yml", ".yaml"} or not resolved.exists():
+        raise ValueError("playbook_not_found")
+    return resolved
+
+
+def build_ansible_check_mode_command(
+    *,
+    playbook_path: str,
+    inventory_hosts: tuple[str, ...],
+    playbook_root: Path | None = None,
+    repair_ssh_key_path: Path = Path("/etc/repair-ssh/id_ed25519"),
+    repair_known_hosts_path: Path = Path("/etc/repair-known-hosts/known_hosts"),
+) -> AnsibleCommandSpec:  # builds argv/cwd/env only; nothing is executed here
+    root = playbook_root or next((path for path in _playbook_roots() if path.exists()), None)
+    if root is None:
+        raise ValueError("ansible_playbook_catalog_missing")
+    inventory_path = (root / "inventory" / "hosts.yml").resolve()
+    if not inventory_path.exists():
+        raise
ValueError("ansible_inventory_missing")
+    if not inventory_hosts or not all(_SAFE_HOST_RE.fullmatch(host) for host in inventory_hosts):
+        raise ValueError("unsafe_inventory_hosts")
+
+    playbook_abs = _resolve_playbook_path(root, playbook_path)
+    ssh_common_args = (
+        f"-o UserKnownHostsFile={repair_known_hosts_path} "
+        "-o IdentitiesOnly=yes -o BatchMode=yes"
+    )
+    extra_vars = {
+        "ansible_ssh_private_key_file": str(repair_ssh_key_path),
+        "ansible_ssh_common_args": ssh_common_args,
+    }
+    command = [
+        "ansible-playbook",
+        "-i",
+        str(inventory_path),
+        str(playbook_abs),
+        "--check",  # dry-run: report what would change without applying it
+        "--diff",
+        "--limit",
+        ",".join(inventory_hosts),  # hosts were validated against _SAFE_HOST_RE above
+        "--extra-vars",
+        json.dumps(extra_vars, ensure_ascii=False, separators=(",", ":")),
+    ]
+    env = {
+        **os.environ,
+        "ANSIBLE_HOST_KEY_CHECKING": "true",
+        "ANSIBLE_RETRY_FILES_ENABLED": "false",
+    }
+    return AnsibleCommandSpec(
+        command=command,
+        cwd=root,
+        env=env,
+        playbook_abs_path=playbook_abs,
+        inventory_abs_path=inventory_path,
+    )
+
+
+async def _run_ansible_command(spec: AnsibleCommandSpec, *, timeout_seconds: int) -> AnsibleRunResult:
+    started = time.monotonic()  # monotonic clock: duration is immune to wall-clock jumps
+    process = await asyncio.create_subprocess_exec(  # argv list, no shell
+        *spec.command,
+        cwd=str(spec.cwd),
+        env=spec.env,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    timed_out = False
+    try:
+        stdout_bytes, stderr_bytes = await asyncio.wait_for(
+            process.communicate(),
+            timeout=timeout_seconds,
+        )
+    except asyncio.TimeoutError:  # same class as TimeoutError on 3.11+, a distinct class before
+        timed_out = True
+        process.kill()
+        stdout_bytes, stderr_bytes = await process.communicate()  # reap the killed process, keep partial output
+    duration_ms = int((time.monotonic() - started) * 1000)
+    return AnsibleRunResult(
+        returncode=124 if timed_out else int(process.returncode or 0),  # 124 is this module's timeout code
+        stdout=stdout_bytes.decode("utf-8", "replace"),
+        stderr=stderr_bytes.decode("utf-8", "replace"),
+        duration_ms=duration_ms,
+        timed_out=timed_out,
+    )
+
+
+def _build_result_payload(result: AnsibleRunResult) -> tuple[str, dict[str, Any], dict[str, Any], str | None]:
+    status = "success" if result.returncode == 0 else "failed"  # any non-zero rc, including 124, is "failed"
+    stdout_tail = _tail(result.stdout, _STDOUT_LIMIT)
+    stderr_tail = _tail(result.stderr, _STDERR_LIMIT)
+    output = {
+        "executor": "ansible",
+        "execution_mode": "check_mode",
+        "check_mode": True,
+        "apply_enabled": False,
+        "approval_required_before_apply": True,
+        "returncode": result.returncode,
+        "timed_out": result.timed_out,
+        "stdout_tail": stdout_tail,
+        "stderr_tail": stderr_tail,
+        "next_required_step": "approval_required_before_ansible_apply",
+    }
+    dry_run_result = {
+        "check_mode_executed": True,  # NOTE(review): also True for synthetic rc=1 results where the command never ran
+        "apply_executed": False,
+        "safe_to_apply_without_approval": False,
+        "returncode": result.returncode,
+        "timed_out": result.timed_out,
+        "stdout_tail": stdout_tail,
+        "stderr_tail": stderr_tail,
+    }
+    error = None if result.returncode == 0 else (stderr_tail or f"ansible_check_mode_failed_rc_{result.returncode}")
+    return status, output, dry_run_result, error
+
+
+async def claim_pending_check_modes(
+    *,
+    project_id: str = "awoooi",
+    limit: int = 1,
+) -> list[AnsibleCheckModeClaim]:
+    """Claim pending Ansible candidates by inserting pending check-mode rows."""
+
+    claims: list[AnsibleCheckModeClaim] = []
+    async with get_db_context(project_id) as db:
+        result = await db.execute(  # candidates with no check-mode/skipped child row yet, oldest first
+            text("""
+                SELECT
+                    candidate.op_id,
+                    candidate.input
+                FROM automation_operation_log candidate
+                WHERE candidate.operation_type = 'ansible_candidate_matched'
+                  AND candidate.status = 'dry_run'
+                  AND candidate.input ->> 'executor' = 'ansible'
+                  AND COALESCE((candidate.dry_run_result ->> 'check_mode_executed')::boolean, false) = false
+                  AND NOT EXISTS (
+                    SELECT 1
+                    FROM automation_operation_log existing
+                    WHERE existing.parent_op_id = candidate.op_id
+                      AND existing.operation_type IN (
+                        'ansible_check_mode_executed',
+                        'ansible_execution_skipped'
+                      )
+                  )
+                ORDER BY candidate.created_at ASC
+                LIMIT :limit
+                FOR UPDATE SKIP LOCKED
+            """),
+            {"limit": max(1, limit)},  # clamp so LIMIT is never below 1
+        )
+        rows = result.mappings().all()
+        for row in rows:
source_op_id = str(row["op_id"])
+            candidate_input = _json_loads(row["input"])
+            try:
+                claim_input = build_ansible_check_mode_claim_input(
+                    source_candidate_op_id=source_op_id,
+                    candidate_input=candidate_input,
+                )
+            except ValueError as exc:  # unsafe or unknown candidate: record a skipped row instead of claiming it
+                await _insert_skipped_candidate(
+                    db,
+                    source_candidate_op_id=source_op_id,
+                    candidate_input=candidate_input,
+                    reason=str(exc),
+                )
+                continue
+            inserted = await db.execute(  # the pending child row is what marks the candidate as claimed
+                text("""
+                    INSERT INTO automation_operation_log (
+                        operation_type, actor, status,
+                        input, output, dry_run_result,
+                        parent_op_id, tags
+                    ) VALUES (
+                        'ansible_check_mode_executed',
+                        'ansible_check_mode_worker',
+                        'pending',
+                        CAST(:input AS jsonb),
+                        '{}'::jsonb,
+                        CAST(:dry_run_result AS jsonb),
+                        CAST(:parent_op_id AS uuid),
+                        :tags
+                    )
+                    RETURNING op_id
+                """),
+                {
+                    "input": json.dumps(claim_input, ensure_ascii=False),
+                    "dry_run_result": json.dumps({
+                        "check_mode_executed": False,
+                        "apply_executed": False,
+                        "claim_state": "claimed",
+                    }, ensure_ascii=False),
+                    "parent_op_id": source_op_id,
+                    "tags": ["ansible", "check_mode", "pending", "apply_locked"],  # NOTE(review): finalize never rewrites tags
+                },
+            )
+            op_id = str(inserted.scalar_one())  # op_id returned by INSERT ... RETURNING
+            claims.append(
+                AnsibleCheckModeClaim(
+                    op_id=op_id,
+                    source_candidate_op_id=source_op_id,
+                    incident_id=str(claim_input.get("incident_id") or ""),
+                    catalog_id=str(claim_input["catalog_id"]),
+                    playbook_path=str(claim_input["playbook_path"]),
+                    inventory_hosts=tuple(str(host) for host in claim_input["inventory_hosts"]),
+                    input_payload=claim_input,
+                )
+            )
+    return claims
+
+
+async def _insert_skipped_candidate(
+    db: Any,
+    *,
+    source_candidate_op_id: str,
+    candidate_input: dict[str, Any],
+    reason: str,
+) -> None:  # writes a child row that stops the claim query from selecting this candidate again
+    input_payload = {
+        "incident_id": str(candidate_input.get("incident_id") or ""),
+        "executor": "ansible",
+        "execution_backend": "ansible",
+        "execution_mode": "check_mode",
+        "check_mode": True,
+        "apply_enabled": False,
+        "source_candidate_op_id": source_candidate_op_id,
+        "not_used_reason": reason,
+    }
+    await
db.execute(  # inserts an 'ansible_execution_skipped' child row for the candidate
+        text("""
+            INSERT INTO automation_operation_log (
+                operation_type, actor, status,
+                input, output, dry_run_result,
+                parent_op_id, tags
+            ) VALUES (
+                'ansible_execution_skipped',
+                'ansible_check_mode_worker',
+                'dry_run',
+                CAST(:input AS jsonb),
+                CAST(:output AS jsonb),
+                CAST(:dry_run_result AS jsonb),
+                CAST(:parent_op_id AS uuid),
+                :tags
+            )
+        """),
+        {
+            "input": json.dumps(input_payload, ensure_ascii=False),
+            "output": json.dumps({
+                "not_used_reason": reason,
+                "decision_effect": "skipped_before_runtime",
+            }, ensure_ascii=False),
+            "dry_run_result": json.dumps({
+                "check_mode_executed": False,
+                "apply_executed": False,
+                "skipped": True,
+                "reason": reason,
+            }, ensure_ascii=False),
+            "parent_op_id": source_candidate_op_id,
+            "tags": ["ansible", "check_mode", "skipped", "apply_locked"],
+        },
+    )
+
+
+async def finalize_check_mode_claim(
+    claim: AnsibleCheckModeClaim,
+    result: AnsibleRunResult,
+    *,
+    project_id: str = "awoooi",
+) -> None:  # overwrites the pending row identified by claim.op_id with the run outcome
+    status, output, dry_run_result, error = _build_result_payload(result)
+    async with get_db_context(project_id) as db:
+        await db.execute(
+            text("""
+                UPDATE automation_operation_log
+                SET status = :status,
+                    output = CAST(:output AS jsonb),
+                    dry_run_result = CAST(:dry_run_result AS jsonb),
+                    error = :error,
+                    duration_ms = :duration_ms,
+                    stderr_feed_back = :stderr
+                WHERE op_id = CAST(:op_id AS uuid)
+            """),
+            {
+                "status": status,
+                "output": json.dumps(output, ensure_ascii=False),
+                "dry_run_result": json.dumps(dry_run_result, ensure_ascii=False),
+                "error": _tail(error or "", 2000) or None,  # capped at 2000 chars; empty text is stored as NULL
+                "duration_ms": result.duration_ms,
+                "stderr": _tail(result.stderr, _STDERR_LIMIT),
+                "op_id": claim.op_id,
+            },
+        )
+
+
+async def run_claimed_check_mode(
+    claim: AnsibleCheckModeClaim,
+    *,
+    timeout_seconds: int,
+    project_id: str = "awoooi",
+) -> AnsibleRunResult:  # always finalizes the claim, even when the command could not be built or run
+    try:
+        spec = build_ansible_check_mode_command(  # default root discovery and SSH paths
+            playbook_path=claim.playbook_path,
+            inventory_hosts=claim.inventory_hosts,
+        )
result = await _run_ansible_command(spec, timeout_seconds=timeout_seconds)
+    except Exception as exc:  # build/run failures become a synthetic rc=1 result so the claim is still finalized
+        result = AnsibleRunResult(
+            returncode=1,
+            stdout="",
+            stderr=f"ansible_check_mode_runtime_error: {exc}",
+            duration_ms=0,
+        )
+    await finalize_check_mode_claim(claim, result, project_id=project_id)
+    logger.info(
+        "ansible_check_mode_claim_completed",
+        op_id=claim.op_id,
+        source_candidate_op_id=claim.source_candidate_op_id,
+        incident_id=claim.incident_id,
+        catalog_id=claim.catalog_id,
+        returncode=result.returncode,
+        timed_out=result.timed_out,
+    )
+    return result
+
+
+async def run_pending_check_modes_once(
+    *,
+    project_id: str = "awoooi",
+    limit: int = 1,
+    timeout_seconds: int | None = None,
+) -> dict[str, Any]:  # summary dict with keys: claimed, completed, failed, blockers
+    blockers = _runtime_blockers()
+    if blockers:  # nothing is claimed while a runtime prerequisite is missing
+        logger.warning("ansible_check_mode_runtime_blocked", blockers=blockers)
+        return {"claimed": 0, "completed": 0, "failed": 0, "blockers": blockers}
+
+    claims = await claim_pending_check_modes(project_id=project_id, limit=limit)
+    completed = 0
+    failed = 0
+    for claim in claims:  # NOTE(review): an exception here leaves later claims 'pending'; the claim query skips them
+        result = await run_claimed_check_mode(
+            claim,
+            timeout_seconds=timeout_seconds or settings.AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS,  # None/0 -> setting
+            project_id=project_id,
+        )
+        completed += 1
+        if result.returncode != 0:
+            failed += 1
+    return {
+        "claimed": len(claims),
+        "completed": completed,
+        "failed": failed,
+        "blockers": [],
+    }
diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py
index 5299b984..33144c4d 100644
--- a/apps/api/src/services/awooop_truth_chain_service.py
+++ b/apps/api/src/services/awooop_truth_chain_service.py
@@ -691,18 +691,32 @@ def _execution_backend_summary(records: list[dict[str, Any]]) -> dict[str, Any]:
         summary["ansible_considered_total"] += 1
         summary["ansible_audit_record_total"] += len(ansible_records)
         summary["ansible_candidate_total"] += len(candidates)
+        terminal_check_mode_parent_ids = {  # candidates that already have a check-mode or skipped child row
+            str(row.get("parent_op_id"))
+            for row in ansible_records
+            if isinstance(row, dict)
+            and str(row.get("operation_type") or "") in {
+                "ansible_check_mode_executed",
+                "ansible_execution_skipped",
+            }
+            and row.get("parent_op_id")
+        }
         for row in ansible_records:
             if not isinstance(row, dict):
                 continue
             operation_type = str(row.get("operation_type") or "")
-            if operation_type == "ansible_check_mode_executed":
+            status = str(row.get("status") or "").lower()
+            if operation_type == "ansible_check_mode_executed" and status != "pending":  # unfinished claims do not count
                 summary["ansible_check_mode_total"] += 1
             elif operation_type == "ansible_apply_executed":
                 summary["ansible_apply_total"] += 1
             elif operation_type == "ansible_rollback_executed":
                 summary["ansible_rollback_total"] += 1
-            elif operation_type == "ansible_candidate_matched":
+            elif (  # only candidates without a check-mode/skipped child row are still pending
+                operation_type == "ansible_candidate_matched"
+                and str(row.get("op_id")) not in terminal_check_mode_parent_ids
+            ):
                 summary["ansible_pending_check_mode_total"] += 1
     return summary
 
@@ -1233,6 +1247,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[
                 status,
                 incident_id,
                 run_id,
+                parent_op_id,
                 actor,
                 dry_run_result,
                 error,
diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py
index 0fe1a590..0d2d59c9 100644
--- a/apps/api/tests/test_awooop_truth_chain_service.py
+++ b/apps/api/tests/test_awooop_truth_chain_service.py
@@ -9,11 +9,16 @@ from src.services.awooop_ansible_audit_service import (
     build_ansible_decision_audit_payload,
     build_ansible_truth,
 )
+from src.services.awooop_ansible_check_mode_service import (
+    build_ansible_check_mode_claim_input,
+    build_ansible_check_mode_command,
+)
 from src.services.awooop_truth_chain_service import (
     _ansible_playbook_roots,
     _ansible_runtime_readiness,
     _automation_quality_score_bucket,
     _clean_row,
+    _execution_backend_summary,
     _incident_fingerprints,
     _summarize_gateway_mcp,
     _truth_status,
@@ -855,3 +860,149 @@ def
test_ansible_decision_audit_payload_is_dry_run_only() -> None:
     assert payload["input"]["executor_candidates"]
     assert payload["output"]["decision_effect"] == "audit_only"
     assert payload["dry_run_result"]["check_mode_executed"] is False
+
+
+def test_ansible_decision_audit_payload_exposes_check_mode_safety_flags() -> None:  # candidates carry the safety flags
+    incident = SimpleNamespace(
+        incident_id="INC-MOMO",
+        project_id="awoooi",
+        alert_category="database",
+        notification_type="TYPE-3",
+        severity=SimpleNamespace(value="P3"),
+        affected_services=["momo"],
+        signals=[
+            SimpleNamespace(
+                alert_name="MomoPostgresBackupFailed",
+                labels={"alertname": "MomoPostgresBackupFailed", "instance": "188"},
+                annotations={},
+            )
+        ],
+    )
+
+    payload = build_ansible_decision_audit_payload(
+        incident=incident,
+        proposal_data={"source": "expert_system", "risk_level": "low"},
+        decision_path="manual_approval",
+        not_used_reason="candidate audit",
+    )
+
+    candidate = payload["input"]["executor_candidates"][0]  # only the first candidate is inspected
+    assert candidate["catalog_id"] == "ansible:188-ai-web"
+    assert candidate["supports_check_mode"] is True
+    assert candidate["auto_apply_enabled"] is False
+    assert candidate["approval_required"] is True
+
+
+def test_ansible_check_mode_claim_input_keeps_apply_locked() -> None:  # claim input is check-mode with apply disabled
+    candidate_input = {
+        "incident_id": "INC-MOMO",
+        "executor": "ansible",
+        "executor_candidates": [
+            {
+                "catalog_id": "ansible:188-ai-web",
+                "playbook_path": "infra/ansible/playbooks/188-ai-web.yml",
+                "inventory_hosts": ["host_188"],
+                "risk_level": "medium",
+            }
+        ],
+    }
+
+    claim = build_ansible_check_mode_claim_input(
+        source_candidate_op_id="00000000-0000-0000-0000-000000000001",
+        candidate_input=candidate_input,
+    )
+
+    assert claim["execution_mode"] == "check_mode"
+    assert claim["check_mode"] is True
+    assert claim["diff"] is True
+    assert claim["apply_enabled"] is False
+    assert claim["approval_required_before_apply"] is True
+    assert claim["playbook_path"] == "infra/ansible/playbooks/188-ai-web.yml"
+
+
+def
test_ansible_check_mode_claim_rejects_non_check_mode_catalog() -> None:  # expects ValueError for this catalog entry
+    candidate_input = {
+        "incident_id": "INC-SSH",
+        "executor": "ansible",
+        "executor_candidates": [
+            {
+                "catalog_id": "ansible:restore-password-auth",
+                "playbook_path": "infra/ansible/playbooks/restore-password-auth.yml",
+                "inventory_hosts": ["host_188"],
+                "risk_level": "high",
+            }
+        ],
+    }
+
+    try:
+        build_ansible_check_mode_claim_input(
+            source_candidate_op_id="00000000-0000-0000-0000-000000000002",
+            candidate_input=candidate_input,
+        )
+    except ValueError as exc:
+        assert str(exc) == "no_safe_check_mode_candidate"
+    else:  # reaching here means the claim was built, which is the failure case
+        raise AssertionError("non-check-mode catalog should be rejected")
+
+
+def test_ansible_check_mode_command_uses_check_diff_and_repair_ssh(tmp_path: Path) -> None:  # argv built on a temp layout
+    playbook_root = tmp_path / "infra" / "ansible"
+    playbook_dir = playbook_root / "playbooks"
+    inventory_dir = playbook_root / "inventory"
+    playbook_dir.mkdir(parents=True)
+    inventory_dir.mkdir(parents=True)
+    (playbook_dir / "188-ai-web.yml").write_text("---\n- hosts: host_188\n tasks: []\n")
+    (inventory_dir / "hosts.yml").write_text("all: {}\n")
+    repair_key = tmp_path / "id_ed25519"
+    known_hosts = tmp_path / "known_hosts"
+    repair_key.write_text("key")
+    known_hosts.write_text("host key")
+
+    spec = build_ansible_check_mode_command(
+        playbook_path="infra/ansible/playbooks/188-ai-web.yml",
+        inventory_hosts=("host_188",),
+        playbook_root=playbook_root,
+        repair_ssh_key_path=repair_key,
+        repair_known_hosts_path=known_hosts,
+    )
+
+    assert "--check" in spec.command
+    assert "--diff" in spec.command
+    assert "--limit" in spec.command
+    assert "host_188" in spec.command
+    assert "ansible_ssh_private_key_file" in spec.command[-1]  # last argv element is the --extra-vars JSON
+    assert str(repair_key) in spec.command[-1]
+    assert str(known_hosts) in spec.command[-1]
+    assert "apply" not in " ".join(spec.command)
+
+
+def test_execution_backend_summary_subtracts_completed_check_mode_parent() -> None:  # finished child clears pending
+    summary = _execution_backend_summary([
+        {
"execution": {
+                "ansible": {
+                    "considered": True,
+                    "candidate_catalog": {"candidates": [{"catalog_id": "ansible:188-ai-web"}]},
+                    "records": [
+                        {
+                            "op_id": "candidate-1",
+                            "operation_type": "ansible_candidate_matched",
+                            "status": "dry_run",
+                        },
+                        {
+                            "op_id": "check-1",
+                            "parent_op_id": "candidate-1",  # points at the candidate row above
+                            "operation_type": "ansible_check_mode_executed",
+                            "status": "success",
+                        },
+                    ],
+                },
+                "automation_operation_log": [],
+                "auto_repair_executions": [],
+            },
+            "automation_quality": {"facts": {}},
+        }
+    ])
+
+    assert summary["ansible_check_mode_total"] == 1
+    assert summary["ansible_pending_check_mode_total"] == 0  # candidate-1 already has a finished child row
diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml
index 3f6bf4a9..52722c08 100644
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -101,6 +101,16 @@ spec:
               value: "80"
             - name: PROMETHEUS_MULTIPROC_DIR
               value: "/tmp/awoooi-prometheus-multiproc"
+            - name: ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER
+              value: "true"  # turns the check-mode worker on for this deployment (Settings default is False)
+            - name: AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS
+              value: "300"  # same as the Settings default
+            - name: AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT
+              value: "1"  # same as the Settings default
+            - name: AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS
+              value: "180"  # same as the Settings default
+            - name: AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS
+              value: "120"  # same as the Settings default
           # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
           volumeMounts:
             - name: repair-ssh-key