diff --git a/apps/api/migrations/adr090d_ansible_operation_types.sql b/apps/api/migrations/adr090d_ansible_operation_types.sql new file mode 100644 index 00000000..636c14c5 --- /dev/null +++ b/apps/api/migrations/adr090d_ansible_operation_types.sql @@ -0,0 +1,36 @@ +-- ADR-090-D: automation_operation_log.operation_type adds Ansible executor audit states +-- Created: 2026-05-12 Taipei +-- +-- Purpose: +-- T3 Ansible declarative executor visibility. These operation types allow +-- the AI automation truth chain to record that Ansible was matched, +-- check-mode executed, applied, rolled back, or explicitly skipped. +-- +-- Safety: +-- This migration only expands the CHECK allowlist. It does not execute +-- Ansible, change approval behavior, or create auto-remediation rows. + +ALTER TABLE automation_operation_log + DROP CONSTRAINT IF EXISTS automation_operation_log_type_valid; + +ALTER TABLE automation_operation_log + ADD CONSTRAINT automation_operation_log_type_valid CHECK (operation_type IN ( + 'monitor_configured','monitor_removed', + 'alert_fired','alert_suppressed','alert_routed', + 'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated', + 'playbook_generated','playbook_updated','playbook_executed', + 'remediation_executed','remediation_verified','remediation_rolled_back', + 'self_correction_attempted', + 'km_created','km_updated','km_linked', + 'asset_discovered','coverage_recalculated', + 'capacity_recommendation','quota_enforced', + 'notification_formatted', + 'ansible_candidate_matched', + 'ansible_check_mode_executed', + 'ansible_apply_executed', + 'ansible_rollback_executed', + 'ansible_execution_skipped' + )); + +COMMENT ON CONSTRAINT automation_operation_log_type_valid ON automation_operation_log IS + 'ADR-090-D: allow first-class Ansible executor audit states for AwoooP truth-chain visibility.'; diff --git a/apps/api/migrations/adr090d_ansible_operation_types_down.sql b/apps/api/migrations/adr090d_ansible_operation_types_down.sql 
"""AwoooP Ansible audit helpers.

This module is intentionally non-executing. It exposes the Ansible audit
contract and repo-known playbook catalog so the truth chain can say whether
Ansible was actually considered or executed, without pretending that catalog
hints are runtime remediation.
"""

from __future__ import annotations

from typing import Any


ANSIBLE_OPERATION_TYPES = frozenset({
    "ansible_candidate_matched",
    "ansible_check_mode_executed",
    "ansible_apply_executed",
    "ansible_rollback_executed",
    "ansible_execution_skipped",
})

# Row keys that may name the executor backend of an operation.
_EXECUTOR_KEYS: tuple[str, ...] = (
    "input_executor",
    "input_execution_backend",
    "output_executor",
    "output_execution_backend",
)

# Row keys that may carry a playbook path, in reporting priority order.
_PLAYBOOK_PATH_KEYS: tuple[str, ...] = (
    "input_playbook_path",
    "output_playbook_path",
    "input_ansible_playbook_path",
    "output_ansible_playbook_path",
)

# File suffixes accepted by the "looks like an Ansible playbook" heuristic.
_PLAYBOOK_SUFFIXES: tuple[str, ...] = (".yml", ".yaml")

# Catalog fields exposed to callers; "keywords" is deliberately excluded.
_PUBLIC_CATALOG_KEYS = frozenset({
    "catalog_id",
    "playbook_path",
    "inventory_hosts",
    "domains",
    "supports_check_mode",
    "auto_apply_enabled",
    "approval_required",
    "risk_level",
})

_CATALOG: tuple[dict[str, Any], ...] = (
    {
        "catalog_id": "ansible:110-devops",
        "playbook_path": "infra/ansible/playbooks/110-devops.yml",
        "inventory_hosts": ["host_110"],
        "domains": ["swap", "harbor", "sentry", "gitea", "langfuse", "bitan", "runner", "keepalived", "nginx"],
        "keywords": [
            "110",
            "swap",
            "harbor",
            "sentry",
            "gitea",
            "langfuse",
            "bitan",
            "runner",
            "github-runner",
            "keepalived",
        ],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:188-ai-web",
        "playbook_path": "infra/ansible/playbooks/188-ai-web.yml",
        "inventory_hosts": ["host_188"],
        "domains": ["docker", "momo_backup", "signoz", "minio", "litellm", "n8n", "open_webui", "nginx"],
        "keywords": [
            "188",
            "momo",
            "backup",
            "postgresql",
            "pg_backup",
            "signoz",
            "minio",
            "litellm",
            "n8n",
            "open-webui",
            "openwebui",
            "docker-registry",
        ],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:nginx-sync",
        "playbook_path": "infra/ansible/playbooks/nginx-sync.yml",
        "inventory_hosts": ["host_110", "host_188"],
        "domains": ["nginx", "proxy", "ollama_proxy", "tls"],
        "keywords": ["nginx", "proxy", "ollama", "gcp", "tls", "cert", "502", "upstream"],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:restore-password-auth",
        "playbook_path": "infra/ansible/playbooks/restore-password-auth.yml",
        "inventory_hosts": ["host_110", "host_120", "host_121", "host_188"],
        "domains": ["ssh", "password_auth"],
        "keywords": ["ssh", "passwordauthentication", "password auth", "login", "auth"],
        "supports_check_mode": False,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "high",
    },
)


def _get(row: dict[str, Any], key: str) -> Any:
    """Return ``row[key]`` or ``None`` when the key is absent."""
    return row.get(key)


def _tags(row: dict[str, Any]) -> list[str]:
    """Return the row's tags as a list of lower-cased strings.

    Accepts a list/tuple/set of tags or a single comma-separated string; any
    other shape (including a missing ``tags`` key) yields an empty list.
    """
    raw = _get(row, "tags")
    if isinstance(raw, (list, tuple, set, frozenset)):
        return [str(item).lower() for item in raw]
    if isinstance(raw, str):
        return [part.strip().lower() for part in raw.split(",") if part.strip()]
    return []


def _first_present(row: dict[str, Any], keys: tuple[str, ...]) -> Any:
    """Return the first value among ``keys`` that is neither ``None`` nor ``""``.

    Falsy-but-meaningful values such as ``False`` or ``0`` are returned as-is.
    Returns ``None`` when no key holds a value.
    """
    for key in keys:
        value = _get(row, key)
        if value not in (None, ""):
            return value
    return None


def _looks_like_ansible_path(value: Any) -> bool:
    """Heuristically decide whether ``value`` is the path of an Ansible playbook."""
    path = str(value or "").lower()
    if "infra/ansible/" in path:
        return True
    # Outside the repo's canonical directory, require both a YAML suffix and
    # the word "ansible" somewhere in the path.
    return path.endswith(_PLAYBOOK_SUFFIXES) and "ansible" in path


def _is_ansible_operation(row: dict[str, Any]) -> bool:
    """Return True when an automation operation row is an Ansible audit fact.

    A row qualifies when any of the following holds:

    * its ``operation_type`` is one of ``ANSIBLE_OPERATION_TYPES``;
    * it carries the exact tag ``ansible``;
    * any executor/backend field equals ``ansible`` (case-insensitive);
    * any playbook path field looks like an Ansible playbook path.

    Every executor and path field is inspected, not only the first non-empty
    one, so an unrelated value in ``input_*`` cannot hide an Ansible value in
    ``output_*``.
    """
    operation_type = str(_get(row, "operation_type") or "").lower()
    if operation_type in ANSIBLE_OPERATION_TYPES:
        return True
    if "ansible" in _tags(row):
        return True
    if any(str(_get(row, key) or "").strip().lower() == "ansible" for key in _EXECUTOR_KEYS):
        return True
    return any(_looks_like_ansible_path(_get(row, key)) for key in _PLAYBOOK_PATH_KEYS)


def _ansible_record(row: dict[str, Any]) -> dict[str, Any]:
    """Project an automation operation row onto the public Ansible record shape.

    Values are passed through untouched; for fields that may live in either
    ``input_*`` or ``output_*`` keys the first non-empty one wins.
    """
    return {
        "op_id": _get(row, "op_id"),
        "operation_type": _get(row, "operation_type"),
        "status": _get(row, "status"),
        "actor": _get(row, "actor"),
        "playbook_id": _first_present(row, ("input_playbook_id", "output_playbook_id")),
        "playbook_path": _first_present(row, _PLAYBOOK_PATH_KEYS),
        "check_mode": _first_present(row, ("input_check_mode", "output_check_mode")),
        "not_used_reason": _first_present(row, ("input_not_used_reason", "output_not_used_reason")),
        "dry_run_result": _get(row, "dry_run_result"),
        "error": _get(row, "error"),
        "duration_ms": _get(row, "duration_ms"),
        "tags": _get(row, "tags"),
        "created_at": _get(row, "created_at"),
    }


def _flatten_text(value: Any, pieces: list[str], remaining: int = 80) -> int:
    """Append lower-cased scalar text found in ``value`` to ``pieces``.

    Dicts contribute both keys and values; lists are walked in order; anything
    else is stringified as a single piece. ``remaining`` is a budget of scalar
    pieces that bounds the work on large payloads.

    Returns the budget left after processing ``value``.
    """
    if remaining <= 0 or value is None:
        return remaining
    if isinstance(value, dict):
        for key, item in value.items():
            remaining = _flatten_text(key, pieces, remaining)
            remaining = _flatten_text(item, pieces, remaining)
            if remaining <= 0:
                break
        return remaining
    if isinstance(value, list):
        for item in value:
            remaining = _flatten_text(item, pieces, remaining)
            if remaining <= 0:
                break
        return remaining
    pieces.append(str(value).lower())
    return remaining - 1


def _source_haystack(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> str:
    """Build the lower-cased search text for keyword matching.

    ``incident`` and ``drift`` are flattened independently, each with its own
    default piece budget, and joined with single spaces.
    """
    pieces: list[str] = []
    _flatten_text(incident, pieces)
    _flatten_text(drift, pieces)
    return " ".join(pieces)


def _catalog_hints(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> dict[str, Any]:
    """Match the static playbook catalog against the source text.

    Matching is a plain substring test of each catalog keyword against the
    flattened incident/drift text, so short keywords can over-match; the
    result is a hint only, which is why ``decision_effect`` is ``"none"``.

    Candidates are sorted by descending match count, then by ``catalog_id``.
    List-valued catalog fields are copied so callers may freely mutate the
    returned structure without corrupting the module-level catalog.
    """
    haystack = _source_haystack(incident, drift)
    candidates: list[dict[str, Any]] = []
    unmatched: list[str] = []
    for item in _CATALOG:
        matched = [keyword for keyword in item["keywords"] if keyword in haystack]
        if not matched:
            unmatched.append(item["catalog_id"])
            continue
        public_item = {
            key: (list(value) if isinstance(value, list) else value)
            for key, value in item.items()
            if key in _PUBLIC_CATALOG_KEYS
        }
        candidates.append({
            **public_item,
            "match_score": len(matched),
            "matched_keywords": matched,
        })
    candidates.sort(key=lambda row: (-int(row["match_score"]), str(row["catalog_id"])))
    return {
        "match_mode": "static_catalog_keyword_hint_v1",
        "decision_effect": "none",
        "available_count": len(_CATALOG),
        "candidates": candidates,
        "unmatched_catalog_ids": unmatched,
    }


def build_ansible_truth(
    automation_ops: list[dict[str, Any]],
    *,
    incident: dict[str, Any] | None,
    drift: dict[str, Any] | None,
) -> dict[str, Any]:
    """Build the truth-chain Ansible section from audited facts and catalog hints.

    Args:
        automation_ops: Automation operation rows for the source, as dicts.
            ``None`` is tolerated and treated as no rows.
        incident: Incident payload used only for catalog keyword hints.
        drift: Drift payload used only for catalog keyword hints.

    Returns:
        A dict with ``considered`` (True only when at least one audited
        Ansible row exists), the projected ``records``, the static
        ``audit_contract``, the ``candidate_catalog`` hints, and a
        ``not_used_reason`` that is ``None`` whenever records exist.
    """
    rows = automation_ops or []
    records = [_ansible_record(row) for row in rows if _is_ansible_operation(row)]
    return {
        "considered": bool(records),
        "records": records,
        "audit_contract": {
            "schema_version": "ansible_executor_audit_v1",
            "operation_types": sorted(ANSIBLE_OPERATION_TYPES),
            "required_audit_fields": [
                "operation_type",
                "status",
                "actor",
                "input.executor",
                "input.playbook_path",
                "input.check_mode",
                "output.not_used_reason",
                "dry_run_result",
            ],
            "default_execution_mode": "catalog/dry-run audit only until approval execution is explicitly wired",
        },
        "candidate_catalog": _catalog_hints(incident, drift),
        "not_used_reason": (
            None
            if records
            else "no automation_operation_log row with Ansible operation type, tag, or executor backend for this source"
        ),
    }
output_playbook_id, + output ->> 'playbook_path' AS output_playbook_path, + output ->> 'ansible_playbook_path' AS output_ansible_playbook_path, + output ->> 'check_mode' AS output_check_mode, + output ->> 'not_used_reason' AS output_not_used_reason, created_at FROM automation_operation_log - WHERE coalesce(input::text, '') LIKE :needle + WHERE incident_id::text = :incident_id + OR coalesce(input::text, '') LIKE :needle OR coalesce(output::text, '') LIKE :needle OR coalesce(array_to_string(tags, ','), '') LIKE :needle ORDER BY created_at DESC LIMIT :limit """, - {"needle": f"%{incident_id}%", "limit": _MAX_ROWS}, + {"incident_id": incident_id, "needle": f"%{incident_id}%", "limit": _MAX_ROWS}, ) km_entries = await _fetch_all( db, @@ -626,11 +642,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ }, "execution": { "automation_operation_log": automation_ops, - "ansible": { - "considered": False, - "records": [], - "not_used_reason": "no first-class Ansible executor audit record in current truth chain", - }, + "ansible": build_ansible_truth(automation_ops, incident=incident, drift=drift), }, "learning": { "knowledge_entries": km_entries, diff --git a/apps/api/src/services/incident_timeline_service.py b/apps/api/src/services/incident_timeline_service.py index b14dbccc..975f496a 100644 --- a/apps/api/src/services/incident_timeline_service.py +++ b/apps/api/src/services/incident_timeline_service.py @@ -104,6 +104,11 @@ _AUTOMATION_STAGE_MAP = { "capacity_recommendation": "investigator", "quota_enforced": "safe", "notification_formatted": "safe", + "ansible_candidate_matched": "ai_router", + "ansible_check_mode_executed": "executor", + "ansible_apply_executed": "executor", + "ansible_rollback_executed": "executor", + "ansible_execution_skipped": "safe", } _AUTOMATION_STATUS_MAP = { "pending": "pending", diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index 
bf99813e..b872e489 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,5 +1,6 @@ from __future__ import annotations +from src.services.awooop_ansible_audit_service import build_ansible_truth from src.services.awooop_truth_chain_service import _clean_row, _truth_status @@ -61,3 +62,48 @@ def test_truth_status_marks_repeated_pending_drift_as_human_needed() -> None: assert status["needs_human"] is True assert "drift_report_pending_without_resolution" in status["blockers"] assert "drift_ai_confidence_zero" in status["blockers"] + + +def test_ansible_truth_surfaces_audited_check_mode_record() -> None: + truth = build_ansible_truth( + [ + { + "op_id": "op-ansible-1", + "operation_type": "ansible_check_mode_executed", + "status": "dry_run", + "actor": "platform_operator", + "input_playbook_path": "infra/ansible/playbooks/188-ai-web.yml", + "input_check_mode": "true", + "dry_run_result": {"changed": 1}, + "tags": ["ansible", "check_mode"], + "created_at": "2026-05-12T22:00:00+08:00", + } + ], + incident={"incident_id": "INC-1", "alertname": "momo pg_backup failed on 188"}, + drift=None, + ) + + assert truth["considered"] is True + assert truth["not_used_reason"] is None + assert truth["records"][0]["playbook_path"] == "infra/ansible/playbooks/188-ai-web.yml" + assert truth["records"][0]["check_mode"] == "true" + assert truth["records"][0]["dry_run_result"] == {"changed": 1} + assert "ansible_check_mode_executed" in truth["audit_contract"]["operation_types"] + assert truth["candidate_catalog"]["decision_effect"] == "none" + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:188-ai-web" + assert truth["candidate_catalog"]["candidates"][0]["auto_apply_enabled"] is False + + +def test_ansible_truth_keeps_catalog_hint_separate_from_runtime_use() -> None: + truth = build_ansible_truth( + [], + incident={"incident_id": "INC-2", "alertname": "nginx 502 upstream timeout"}, + 
drift=None, + ) + + assert truth["considered"] is False + assert truth["records"] == [] + assert truth["not_used_reason"].startswith("no automation_operation_log row") + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:nginx-sync" + assert truth["candidate_catalog"]["candidates"][0]["approval_required"] is True + assert truth["candidate_catalog"]["decision_effect"] == "none" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 11cdb3c5..90400cf4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,39 @@ +## 2026-05-12 | T3 Ansible audit surface 第一段 + +**背景**:Telegram / truth-chain live audit 顯示 Ansible 目前仍只是 repo/主機部署工具,沒有出現在 AI 自動化修復鏈路的 first-class audit record;Operator 無法知道「是否被考慮、是否 dry-run、為何沒用」。 + +**修正**: +- 新增 migration `adr090d_ansible_operation_types.sql`,擴充 `automation_operation_log.operation_type`: + - `ansible_candidate_matched` + - `ansible_check_mode_executed` + - `ansible_apply_executed` + - `ansible_rollback_executed` + - `ansible_execution_skipped` +- 新增 rollback migration `adr090d_ansible_operation_types_down.sql`;`run-migration.yml` 會跳過 `_down.sql`。 +- 新增 `awooop_ansible_audit_service.py`: + - 讀取 automation ops 中的 Ansible operation type/tag/backend。 + - 暴露 repo 既有 playbook catalog hint。 + - 明確標示 `decision_effect=none`,避免把候選 playbook 當成已執行。 +- truth-chain `execution.ansible` 現在會顯示: + - `considered` 是否有真實 Ansible audit record。 + - `records`、`audit_contract`、`candidate_catalog`、`not_used_reason`。 +- `incident_timeline_service` 補 Ansible operation type → stage mapping。 + +**驗證**: +- `py_compile`:Ansible audit service / truth-chain / incident timeline / truth-chain tests 通過。 +- `ruff --select F,E9`:All checks passed。 +- `pytest apps/api/tests/test_awooop_truth_chain_service.py apps/api/tests/test_platform_router_order.py apps/api/tests/test_awooop_operator_auth.py -q`:13 passed。 +- `ruby YAML.load_file(".gitea/workflows/run-migration.yml")`:ok。 +- `git diff --check`:ok。 + +**整體進度**: +- Wave 0:MOMO PostgreSQL backup 
→ AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible first-class audit contract / truth-chain 可見性完成;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 +- 下一步:推版後觀察 `run-migration`,確認新增 migration 與 audit seed 都通過。 + ## 2026-05-12 | run-migration audit seed 再修正 **背景**:Gitea `run-migration` 在 `Seed asset_discovery_run (audit)` 再次失敗: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 9292dd8e..87c14867 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1892,6 +1892,32 @@ Phase 6 完成後 --- +### 2026-05-12 晚 (台北) — T3 Ansible declarative executor audit surface 第一段 + +**範圍**: +- `automation_operation_log.operation_type` CHECK 追加 Ansible executor audit states: + `ansible_candidate_matched` / `ansible_check_mode_executed` / + `ansible_apply_executed` / `ansible_rollback_executed` / + `ansible_execution_skipped`。 +- 新增 `awooop_ansible_audit_service.py`,把 repo 既有 Ansible playbook catalog 以 + read-only 方式暴露給 truth-chain。 +- truth-chain `execution.ansible` 改為顯示: + - 是否真的有 `automation_operation_log` Ansible audit record。 + - audit contract / required fields。 + - static catalog keyword hints,且 `decision_effect=none`,避免把候選 playbook 誤判成已自動修復。 +- `incident_timeline_service` 加入 Ansible operation type stage mapping。 + +**已驗證**: +- 本地 `py_compile` / `ruff F,E9` / `git diff --check` 通過。 +- `test_awooop_truth_chain_service.py`、router order、operator auth 共 13 passed。 +- `run-migration.yml` YAML parse 通過;新增 `_down.sql` 會被既有 workflow skip 規則排除。 + +**仍未宣稱完成**: +- 這不是 Ansible 自動修復執行器接線;目前只建立 first-class audit contract 與 truth-chain 可見性。 
+- 下一段需把 decision / approval execution path 在「只 dry-run/check-mode」下寫入上述 operation types,再談 apply。 + +--- + ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) **觸發**:統帥全景盤查 AI 自動化節點後,發現 Playbook 自動修復鏈路有 3 個結構性斷點。