From b4d367eeb463eccda5aec8aa9c90f19897dbd634 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 03:21:31 +0800 Subject: [PATCH 01/15] feat(awooop): expose mcp bridge truth chain --- .../services/awooop_truth_chain_service.py | 13 +++- .../tests/test_awooop_truth_chain_service.py | 17 ++++- docs/LOGBOOK.md | 37 ++++++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 2 + ...awooop-mcp-gateway-bridge-backfill-24h.sql | 69 +++++++++++++++++++ 5 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index d1e669dd..78867133 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -7,6 +7,7 @@ Telegram cards can be audited without guessing which subsystem owns the truth. from __future__ import annotations +import json from datetime import date, datetime from decimal import Decimal from typing import Any @@ -20,6 +21,7 @@ from src.db.base import get_db_context logger = structlog.get_logger(__name__) _MAX_ROWS = 100 +_JSON_TEXT_FIELDS = {"gate_result", "source_envelope"} def _clean(value: Any) -> Any: @@ -38,7 +40,15 @@ def _clean(value: Any) -> Any: def _clean_row(row: Any) -> dict[str, Any]: - return {key: _clean(value) for key, value in dict(row).items()} + cleaned: dict[str, Any] = {} + for key, value in dict(row).items(): + if key in _JSON_TEXT_FIELDS and isinstance(value, str): + try: + value = json.loads(value) + except json.JSONDecodeError: + pass + cleaned[key] = _clean(value) + return cleaned async def _fetch_all(db: Any, sql: str, params: dict[str, Any]) -> list[dict[str, Any]]: @@ -507,6 +517,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ trace_id, agent_id, tool_name, + gate_result, result_status, block_gate, block_reason, diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index 4c308732..bf99813e 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,6 +1,21 @@ from __future__ import annotations -from src.services.awooop_truth_chain_service import _truth_status +from src.services.awooop_truth_chain_service import _clean_row, _truth_status + + +def test_clean_row_parses_json_text_fields_for_gateway_visibility() -> None: + row = { + "gate_result": '{"schema_version":"legacy_mcp_bridge_v1","policy_enforced":false}', + "source_envelope": '{"adapter":"legacy_telegram_gateway"}', + "plain_text": '{"not":"parsed"}', + } + + cleaned = _clean_row(row) + + assert cleaned["gate_result"]["schema_version"] == "legacy_mcp_bridge_v1" + assert cleaned["gate_result"]["policy_enforced"] is False + assert cleaned["source_envelope"]["adapter"] == "legacy_telegram_gateway" + assert cleaned["plain_text"] == '{"not":"parsed"}' def test_truth_status_marks_no_action_approval_as_manual_required() -> None: diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 33522e80..3140f747 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6505,3 +6505,40 @@ gateway_audit_total=0 last_15m=0 bridge_total=0 - 因此目前只能宣稱「T2 bridge 寫入能力已部署並經 rollback smoke 驗證」。 - 尚不能宣稱「所有 MCP / 自建 MCP 都已完全經 AwoooP Gateway 強制治理」;下一段要讓下一個真實 incident / MCP 呼叫自然產生 durable bridge row,或把高頻 caller 改成 first-class `McpGateway`。 + +**T2 backfill / truth-chain visibility 追加**: + +- 新增 `scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql`: + - 將最近 24h 真實 `mcp_audit_log` 鏡像到 `awooop_mcp_gateway_audit`。 + - 以 `gate_result.legacy_audit_id` 做 idempotency key。 + - bridge row 保留 `policy_enforced=false` 與 `not_used_reason`,避免誤判為五閘門已 enforcement。 +- production 已執行 backfill: + +```text +inserted_bridge_rows=1160 +gateway_total=1310 bridge_total=1310 last_24h=1276 +B6C589_gateway_rows=8 failed=8 success=0 +``` + +- truth-chain API 追加 `gate_result` 欄位,並把 JSONB text 解析回物件,讓 UI 能顯示 bridge reason。 + +```text +py_compile: +apps/api/src/services/awooop_truth_chain_service.py +apps/api/tests/test_awooop_truth_chain_service.py +# OK + +ruff F,E9: +# All checks passed + +pytest: +apps/api/tests/test_awooop_truth_chain_service.py +apps/api/tests/test_platform_router_order.py +apps/api/tests/test_awooop_operator_auth.py +# 11 passed +``` + +**效果**: + +- `INC-20260512-B6C589` truth-chain 現在不再是 `awooop_mcp_gateway_audit_empty`。 +- 仍顯示 `manual_required/blocked`,因為 8 個 SSH MCP 都失敗,approval/incident 狀態仍矛盾;這是 T5 要處理,不能用 T2 粉飾成自動修復完成。 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 15f6a18c..966a03a5 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1883,6 +1883,8 @@ Phase 6 完成後 - T2 bridge image `94d006ea` 已部署,CD run `1921` success,health 200。 - rollback smoke 證明 `record_mcp_call()` 在同一 transaction 內會同時寫 legacy `mcp_audit_log` 與 `awooop_mcp_gateway_audit` bridge row,且 bridge row 標示 `policy_enforced=false` / `not_used_reason=legacy direct provider path; bridge audit only`;rollback 後兩邊皆未污染 production。 - 部署後短觀察窗內沒有自然新 legacy MCP call(`legacy_mcp_15m=0`),所以 live `awooop_mcp_gateway_audit` total 仍是 0。T2 bridge capability 已上線,但 T2 全退出條件仍需下一個真實 MCP 呼叫產生 durable row,或把高頻 caller 改成 first-class Gateway path。 +- 已執行最近 24h 真實 legacy MCP backfill:`inserted_bridge_rows=1160`,目前 `awooop_mcp_gateway_audit gateway_total=1310 / bridge_total=1310 / last_24h=1276`。`INC-20260512-B6C589` 現在 gateway side 可見 8 筆 MCP,8 failed / 0 success;truth-chain blocker 移除 `awooop_mcp_gateway_audit_empty`,但仍是 `manual_required/blocked`,因為 evidence sensors 全失敗、NO_ACTION approval 無 execution、incident 仍 investigating。 +- truth-chain API 追加回傳 `gate_result`,讓 Operator Console 可直接顯示 `policy_enforced=false` 與 `not_used_reason`,避免把 bridge row 誤認為 first-class Gateway enforcement。 **仍未宣稱完成**: - 這只是 legacy bridge,不是把所有呼叫強制改經 AwoooP Gateway;T2 後續仍要把新 MCP caller 收斂到 first-class Gateway path。 diff --git a/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql b/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql new file mode 100644 index 00000000..509621f3 --- /dev/null +++ b/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql @@ -0,0 +1,69 @@ +-- AwoooP T2 MCP Gateway bridge backfill (24h) +-- 2026-05-12 Codex + ogt +-- +-- Purpose: +-- Mirror real legacy mcp_audit_log rows into awooop_mcp_gateway_audit so +-- truth-chain can show MCP usage for recent incidents while first-class +-- Gateway migration continues. These rows are explicitly marked as bridge +-- records and policy_enforced=false; they are not proof of five-gate +-- Gateway enforcement. +-- +-- Idempotency: +-- gate_result.legacy_audit_id stores the mcp_audit_log.id source key. +-- Re-running this SQL will only insert missing rows. + +WITH inserted AS ( + INSERT INTO awooop_mcp_gateway_audit ( + project_id, + run_id, + trace_id, + agent_id, + tool_name, + input_hash, + output_hash, + gate_result, + result_status, + block_gate, + block_reason, + latency_ms, + created_at + ) + SELECT + 'awoooi' AS project_id, + NULL::uuid AS run_id, + LEFT(COALESCE(src.incident_id, src.session_id), 128) AS trace_id, + LEFT(COALESCE(src.agent_role, 'legacy-mcp-provider'), 128) AS agent_id, + LEFT('legacy:' || src.mcp_server || ':' || src.tool_name, 128) AS tool_name, + encode(digest(COALESCE(src.input_params::text, 'null'), 'sha256'), 'hex') AS input_hash, + CASE + WHEN src.output_result IS NULL THEN NULL + ELSE encode(digest(src.output_result::text, 'sha256'), 'hex') + END AS output_hash, + jsonb_build_object( + 'schema_version', 'legacy_mcp_bridge_v1', + 'gateway_path', 'legacy_backfill', + 'policy_enforced', false, + 'not_used_reason', 'legacy direct provider path; bridge audit only', + 'legacy_audit_id', src.id::text, + 'legacy_mcp_server', src.mcp_server, + 'legacy_tool_name', src.tool_name, + 'flywheel_node', src.flywheel_node + ) AS gate_result, + CASE WHEN src.success IS TRUE THEN 'success' ELSE 'failed' END AS result_status, + NULL::smallint AS block_gate, + CASE WHEN src.success IS TRUE THEN NULL ELSE LEFT(src.error_message, 256) END AS block_reason, + src.duration_ms AS latency_ms, + src.created_at + FROM mcp_audit_log src + WHERE src.created_at > NOW() - INTERVAL '24 hours' + AND NOT EXISTS ( + SELECT 1 + FROM awooop_mcp_gateway_audit dst + WHERE dst.project_id = 'awoooi' + AND dst.gate_result->>'schema_version' = 'legacy_mcp_bridge_v1' + AND dst.gate_result->>'legacy_audit_id' = src.id::text + ) + RETURNING call_id +) +SELECT COUNT(*) AS inserted_bridge_rows +FROM inserted; From dba3e405f4e83b9d722d14be07a931bf39c33551 Mon Sep 17 00:00:00 2001 From: AWOOOI CD Date: Wed, 13 May 2026 03:26:51 +0800 Subject: [PATCH 02/15] chore(cd): deploy b4d367e [skip ci] --- k8s/awoooi-prod/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index b0c7434f..3fb1a864 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -40,7 +40,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: 94d006eac88fd65f6efca817eb392a103ec10d3f + newTag: b4d367eeb463eccda5aec8aa9c90f19897dbd634 - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: 94d006eac88fd65f6efca817eb392a103ec10d3f + newTag: b4d367eeb463eccda5aec8aa9c90f19897dbd634 From 124c3c545bf18e786ca2761e20918aa2ff9268fd Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 03:31:17 +0800 Subject: [PATCH 03/15] docs(awooop): record t2 truth-chain deployment --- docs/LOGBOOK.md | 27 +++++++++++++++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 1 + 2 files changed, 28 insertions(+) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3140f747..76a8bce8 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6542,3 +6542,30 @@ apps/api/tests/test_awooop_operator_auth.py - `INC-20260512-B6C589` truth-chain 現在不再是 `awooop_mcp_gateway_audit_empty`。 - 仍顯示 `manual_required/blocked`,因為 8 個 SSH MCP 都失敗,approval/incident 狀態仍矛盾;這是 T5 要處理,不能用 T2 粉飾成自動修復完成。 + +**production deploy / endpoint smoke 追加(完成)**: + +```text +Gitea: +1928 CD Pipeline b4d367ee -> success +1929 Code Review b4d367ee -> success + +K8s image: +awoooi-api 192.168.0.110:5000/awoooi/api:b4d367eeb463eccda5aec8aa9c90f19897dbd634 +awoooi-worker 192.168.0.110:5000/awoooi/api:b4d367eeb463eccda5aec8aa9c90f19897dbd634 +awoooi-web 192.168.0.110:5000/awoooi/web:b4d367eeb463eccda5aec8aa9c90f19897dbd634 + +health: +http://192.168.0.125:32334/api/v1/health -> 200 healthy + +Truth-chain: +GET /api/v1/platform/truth-chain/INC-20260512-B6C589?project_id=awoooi -> 200 +stage=manual_required status=blocked needs_human=True +blockers=all_evidence_sensors_failed, + approval_resolved_no_action_without_execution, + incident_still_investigating_after_approval +gateway_total=8 legacy_total=8 +first_gateway_tool=legacy:ssh_host:ssh_get_nginx_error_log result=failed +gate_schema=legacy_mcp_bridge_v1 policy_enforced=False +not_used_reason=legacy direct provider path; bridge audit only +``` diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 966a03a5..9292dd8e 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1885,6 +1885,7 @@ Phase 6 完成後 - 部署後短觀察窗內沒有自然新 legacy MCP call(`legacy_mcp_15m=0`),所以 live `awooop_mcp_gateway_audit` total 仍是 0。T2 bridge capability 已上線,但 T2 全退出條件仍需下一個真實 MCP 呼叫產生 durable row,或把高頻 caller 改成 first-class Gateway path。 - 已執行最近 24h 真實 legacy MCP backfill:`inserted_bridge_rows=1160`,目前 `awooop_mcp_gateway_audit gateway_total=1310 / bridge_total=1310 / last_24h=1276`。`INC-20260512-B6C589` 現在 gateway side 可見 8 筆 MCP,8 failed / 0 success;truth-chain blocker 移除 `awooop_mcp_gateway_audit_empty`,但仍是 `manual_required/blocked`,因為 evidence sensors 全失敗、NO_ACTION approval 無 execution、incident 仍 investigating。 - truth-chain API 追加回傳 `gate_result`,讓 Operator Console 可直接顯示 `policy_enforced=false` 與 `not_used_reason`,避免把 bridge row 誤認為 first-class Gateway enforcement。 +- `b4d367ee` 已部署,CD run `1928` success。B6C589 endpoint smoke:`gateway_total=8 / legacy_total=8`,第一筆 gateway row 顯示 `gate_schema=legacy_mcp_bridge_v1`、`policy_enforced=False`、`not_used_reason=legacy direct provider path; bridge audit only`;truth status 仍是 `manual_required/blocked`。 **仍未宣稱完成**: - 這只是 legacy bridge,不是把所有呼叫強制改經 AwoooP Gateway;T2 後續仍要把新 MCP caller 收斂到 first-class Gateway path。 From feda8a0b4b7b51bbc073b1bcca27c92c3cd2b148 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 03:40:41 +0800 Subject: [PATCH 04/15] fix(ci): harden migration audit seed --- .gitea/workflows/run-migration.yml | 16 +++++++--------- docs/LOGBOOK.md | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/.gitea/workflows/run-migration.yml b/.gitea/workflows/run-migration.yml index 3ed3ad85..0ec25bf7 100644 --- a/.gitea/workflows/run-migration.yml +++ b/.gitea/workflows/run-migration.yml @@ -133,13 +133,15 @@ jobs: PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}" OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}" FILES_JSON=$(echo "${{ steps.diff.outputs.new_files }}" | jq -Rn '[inputs | select(length > 0)]') + SUMMARY_JSON=$(jq -cn \ + --arg commit_sha "${{ github.sha }}" \ + --argjson files "$FILES_JSON" \ + '{type: "ci_migration", commit_sha: $commit_sha, files: $files}') + SUMMARY_JSON_SQL=${SUMMARY_JSON//\'/\'\'} seed_audit() { local url="$1" - psql "$url" \ - -v ON_ERROR_STOP=1 \ - -v commit_sha="${{ github.sha }}" \ - -v files_json="$FILES_JSON" <<'SQL' + psql "$url" -v ON_ERROR_STOP=1 < Date: Wed, 13 May 2026 03:53:13 +0800 Subject: [PATCH 05/15] feat(awooop): expose ansible audit truth surface --- .../adr090d_ansible_operation_types.sql | 36 +++ .../adr090d_ansible_operation_types_down.sql | 19 ++ .../services/awooop_ansible_audit_service.py | 262 ++++++++++++++++++ .../services/awooop_truth_chain_service.py | 26 +- .../src/services/incident_timeline_service.py | 5 + .../tests/test_awooop_truth_chain_service.py | 46 +++ docs/LOGBOOK.md | 36 +++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 26 ++ 8 files changed, 449 insertions(+), 7 deletions(-) create mode 100644 apps/api/migrations/adr090d_ansible_operation_types.sql create mode 100644 apps/api/migrations/adr090d_ansible_operation_types_down.sql create mode 100644 apps/api/src/services/awooop_ansible_audit_service.py diff --git a/apps/api/migrations/adr090d_ansible_operation_types.sql b/apps/api/migrations/adr090d_ansible_operation_types.sql new file mode 100644 index 00000000..636c14c5 --- /dev/null +++ b/apps/api/migrations/adr090d_ansible_operation_types.sql @@ -0,0 +1,36 @@ +-- ADR-090-D: automation_operation_log.operation_type adds Ansible executor audit states +-- Created: 2026-05-12 Taipei +-- +-- Purpose: +-- T3 Ansible declarative executor visibility. These operation types allow +-- the AI automation truth chain to record that Ansible was matched, +-- check-mode executed, applied, rolled back, or explicitly skipped. +-- +-- Safety: +-- This migration only expands the CHECK allowlist. It does not execute +-- Ansible, change approval behavior, or create auto-remediation rows. + +ALTER TABLE automation_operation_log + DROP CONSTRAINT IF EXISTS automation_operation_log_type_valid; + +ALTER TABLE automation_operation_log + ADD CONSTRAINT automation_operation_log_type_valid CHECK (operation_type IN ( + 'monitor_configured','monitor_removed', + 'alert_fired','alert_suppressed','alert_routed', + 'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated', + 'playbook_generated','playbook_updated','playbook_executed', + 'remediation_executed','remediation_verified','remediation_rolled_back', + 'self_correction_attempted', + 'km_created','km_updated','km_linked', + 'asset_discovered','coverage_recalculated', + 'capacity_recommendation','quota_enforced', + 'notification_formatted', + 'ansible_candidate_matched', + 'ansible_check_mode_executed', + 'ansible_apply_executed', + 'ansible_rollback_executed', + 'ansible_execution_skipped' + )); + +COMMENT ON CONSTRAINT automation_operation_log_type_valid ON automation_operation_log IS + 'ADR-090-D: allow first-class Ansible executor audit states for AwoooP truth-chain visibility.'; diff --git a/apps/api/migrations/adr090d_ansible_operation_types_down.sql b/apps/api/migrations/adr090d_ansible_operation_types_down.sql new file mode 100644 index 00000000..948bdb47 --- /dev/null +++ b/apps/api/migrations/adr090d_ansible_operation_types_down.sql @@ -0,0 +1,19 @@ +-- ADR-090-D rollback: remove Ansible executor audit states from operation_type allowlist. +-- Only apply after confirming no automation_operation_log rows use ansible_* operation types. + +ALTER TABLE automation_operation_log + DROP CONSTRAINT IF EXISTS automation_operation_log_type_valid; + +ALTER TABLE automation_operation_log + ADD CONSTRAINT automation_operation_log_type_valid CHECK (operation_type IN ( + 'monitor_configured','monitor_removed', + 'alert_fired','alert_suppressed','alert_routed', + 'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated', + 'playbook_generated','playbook_updated','playbook_executed', + 'remediation_executed','remediation_verified','remediation_rolled_back', + 'self_correction_attempted', + 'km_created','km_updated','km_linked', + 'asset_discovered','coverage_recalculated', + 'capacity_recommendation','quota_enforced', + 'notification_formatted' + )); diff --git a/apps/api/src/services/awooop_ansible_audit_service.py b/apps/api/src/services/awooop_ansible_audit_service.py new file mode 100644 index 00000000..74d5b728 --- /dev/null +++ b/apps/api/src/services/awooop_ansible_audit_service.py @@ -0,0 +1,262 @@ +"""AwoooP Ansible audit helpers. + +This module is intentionally non-executing. It exposes the Ansible audit +contract and repo-known playbook catalog so the truth chain can say whether +Ansible was actually considered or executed, without pretending that catalog +hints are runtime remediation. +""" + +from __future__ import annotations + +from typing import Any + + +ANSIBLE_OPERATION_TYPES = frozenset({ + "ansible_candidate_matched", + "ansible_check_mode_executed", + "ansible_apply_executed", + "ansible_rollback_executed", + "ansible_execution_skipped", +}) + +_CATALOG: tuple[dict[str, Any], ...] = ( + { + "catalog_id": "ansible:110-devops", + "playbook_path": "infra/ansible/playbooks/110-devops.yml", + "inventory_hosts": ["host_110"], + "domains": ["swap", "harbor", "sentry", "gitea", "langfuse", "bitan", "runner", "keepalived", "nginx"], + "keywords": [ + "110", + "swap", + "harbor", + "sentry", + "gitea", + "langfuse", + "bitan", + "runner", + "github-runner", + "keepalived", + ], + "supports_check_mode": True, + "auto_apply_enabled": False, + "approval_required": True, + "risk_level": "medium", + }, + { + "catalog_id": "ansible:188-ai-web", + "playbook_path": "infra/ansible/playbooks/188-ai-web.yml", + "inventory_hosts": ["host_188"], + "domains": ["docker", "momo_backup", "signoz", "minio", "litellm", "n8n", "open_webui", "nginx"], + "keywords": [ + "188", + "momo", + "backup", + "postgresql", + "pg_backup", + "signoz", + "minio", + "litellm", + "n8n", + "open-webui", + "openwebui", + "docker-registry", + ], + "supports_check_mode": True, + "auto_apply_enabled": False, + "approval_required": True, + "risk_level": "medium", + }, + { + "catalog_id": "ansible:nginx-sync", + "playbook_path": "infra/ansible/playbooks/nginx-sync.yml", + "inventory_hosts": ["host_110", "host_188"], + "domains": ["nginx", "proxy", "ollama_proxy", "tls"], + "keywords": ["nginx", "proxy", "ollama", "gcp", "tls", "cert", "502", "upstream"], + "supports_check_mode": True, + "auto_apply_enabled": False, + "approval_required": True, + "risk_level": "medium", + }, + { + "catalog_id": "ansible:restore-password-auth", + "playbook_path": "infra/ansible/playbooks/restore-password-auth.yml", + "inventory_hosts": ["host_110", "host_120", "host_121", "host_188"], + "domains": ["ssh", "password_auth"], + "keywords": ["ssh", "passwordauthentication", "password auth", "login", "auth"], + "supports_check_mode": False, + "auto_apply_enabled": False, + "approval_required": True, + "risk_level": "high", + }, +) + + +def _get(row: dict[str, Any], key: str) -> Any: + return row.get(key) + + +def _tags(row: dict[str, Any]) -> list[str]: + raw = _get(row, "tags") + if isinstance(raw, list): + return [str(item).lower() for item in raw] + if isinstance(raw, str): + return [part.strip().lower() for part in raw.split(",") if part.strip()] + return [] + + +def _first_present(row: dict[str, Any], keys: tuple[str, ...]) -> Any: + for key in keys: + value = _get(row, key) + if value not in (None, ""): + return value + return None + + +def _is_ansible_operation(row: dict[str, Any]) -> bool: + operation_type = str(_get(row, "operation_type") or "").lower() + if operation_type in ANSIBLE_OPERATION_TYPES: + return True + if "ansible" in _tags(row): + return True + executor = str( + _first_present( + row, + ( + "input_executor", + "input_execution_backend", + "output_executor", + "output_execution_backend", + ), + ) + or "" + ).lower() + if executor == "ansible": + return True + playbook_path = str( + _first_present(row, ("input_playbook_path", "output_playbook_path", "input_ansible_playbook_path", "output_ansible_playbook_path")) + or "" + ).lower() + return "infra/ansible/" in playbook_path or playbook_path.endswith(".yml") and "ansible" in playbook_path + + +def _ansible_record(row: dict[str, Any]) -> dict[str, Any]: + return { + "op_id": _get(row, "op_id"), + "operation_type": _get(row, "operation_type"), + "status": _get(row, "status"), + "actor": _get(row, "actor"), + "playbook_id": _first_present(row, ("input_playbook_id", "output_playbook_id")), + "playbook_path": _first_present( + row, + ("input_playbook_path", "output_playbook_path", "input_ansible_playbook_path", "output_ansible_playbook_path"), + ), + "check_mode": _first_present(row, ("input_check_mode", "output_check_mode")), + "not_used_reason": _first_present(row, ("input_not_used_reason", "output_not_used_reason")), + "dry_run_result": _get(row, "dry_run_result"), + "error": _get(row, "error"), + "duration_ms": _get(row, "duration_ms"), + "tags": _get(row, "tags"), + "created_at": _get(row, "created_at"), + } + + +def _flatten_text(value: Any, pieces: list[str], remaining: int = 80) -> int: + if remaining <= 0 or value is None: + return remaining + if isinstance(value, dict): + for key, item in value.items(): + remaining = _flatten_text(key, pieces, remaining) + remaining = _flatten_text(item, pieces, remaining) + if remaining <= 0: + break + return remaining + if isinstance(value, list): + for item in value: + remaining = _flatten_text(item, pieces, remaining) + if remaining <= 0: + break + return remaining + pieces.append(str(value).lower()) + return remaining - 1 + + +def _source_haystack(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> str: + pieces: list[str] = [] + _flatten_text(incident, pieces) + _flatten_text(drift, pieces) + return " ".join(pieces) + + +def _catalog_hints(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> dict[str, Any]: + haystack = _source_haystack(incident, drift) + candidates: list[dict[str, Any]] = [] + unmatched: list[str] = [] + for item in _CATALOG: + matched = [keyword for keyword in item["keywords"] if keyword in haystack] + public_item = { + key: value + for key, value in item.items() + if key + in { + "catalog_id", + "playbook_path", + "inventory_hosts", + "domains", + "supports_check_mode", + "auto_apply_enabled", + "approval_required", + "risk_level", + } + } + if matched: + candidates.append({ + **public_item, + "match_score": len(matched), + "matched_keywords": matched, + }) + else: + unmatched.append(item["catalog_id"]) + candidates.sort(key=lambda row: (-int(row["match_score"]), str(row["catalog_id"]))) + return { + "match_mode": "static_catalog_keyword_hint_v1", + "decision_effect": "none", + "available_count": len(_CATALOG), + "candidates": candidates, + "unmatched_catalog_ids": unmatched, + } + + +def build_ansible_truth( + automation_ops: list[dict[str, Any]], + *, + incident: dict[str, Any] | None, + drift: dict[str, Any] | None, +) -> dict[str, Any]: + """Build the truth-chain Ansible section from audited facts and catalog hints.""" + + records = [_ansible_record(row) for row in automation_ops if _is_ansible_operation(row)] + return { + "considered": bool(records), + "records": records, + "audit_contract": { + "schema_version": "ansible_executor_audit_v1", + "operation_types": sorted(ANSIBLE_OPERATION_TYPES), + "required_audit_fields": [ + "operation_type", + "status", + "actor", + "input.executor", + "input.playbook_path", + "input.check_mode", + "output.not_used_reason", + "dry_run_result", + ], + "default_execution_mode": "catalog/dry-run audit only until approval execution is explicitly wired", + }, + "candidate_catalog": _catalog_hints(incident, drift), + "not_used_reason": ( + None + if records + else "no automation_operation_log row with Ansible operation type, tag, or executor backend for this source" + ), + } diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index 78867133..ed182eb2 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -17,6 +17,7 @@ import structlog from sqlalchemy import text from src.db.base import get_db_context +from src.services.awooop_ansible_audit_service import build_ansible_truth logger = structlog.get_logger(__name__) @@ -421,15 +422,30 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ error, duration_ms, tags, + input ->> 'executor' AS input_executor, + input ->> 'execution_backend' AS input_execution_backend, + input ->> 'playbook_id' AS input_playbook_id, + input ->> 'playbook_path' AS input_playbook_path, + input ->> 'ansible_playbook_path' AS input_ansible_playbook_path, + input ->> 'check_mode' AS input_check_mode, + input ->> 'not_used_reason' AS input_not_used_reason, + output ->> 'executor' AS output_executor, + output ->> 'execution_backend' AS output_execution_backend, + output ->> 'playbook_id' AS output_playbook_id, + output ->> 'playbook_path' AS output_playbook_path, + output ->> 'ansible_playbook_path' AS output_ansible_playbook_path, + output ->> 'check_mode' AS output_check_mode, + output ->> 'not_used_reason' AS output_not_used_reason, created_at FROM automation_operation_log - WHERE coalesce(input::text, '') LIKE :needle + WHERE incident_id::text = :incident_id + OR coalesce(input::text, '') LIKE :needle OR coalesce(output::text, '') LIKE :needle OR coalesce(array_to_string(tags, ','), '') LIKE :needle ORDER BY created_at DESC LIMIT :limit """, - {"needle": f"%{incident_id}%", "limit": _MAX_ROWS}, + {"incident_id": incident_id, "needle": f"%{incident_id}%", "limit": _MAX_ROWS}, ) km_entries = await _fetch_all( db, @@ -626,11 +642,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ }, "execution": { "automation_operation_log": automation_ops, - "ansible": { - "considered": False, - "records": [], - "not_used_reason": "no first-class Ansible executor audit record in current truth chain", - }, + "ansible": build_ansible_truth(automation_ops, incident=incident, drift=drift), }, "learning": { "knowledge_entries": km_entries, diff --git a/apps/api/src/services/incident_timeline_service.py b/apps/api/src/services/incident_timeline_service.py index b14dbccc..975f496a 100644 --- a/apps/api/src/services/incident_timeline_service.py +++ b/apps/api/src/services/incident_timeline_service.py @@ -104,6 +104,11 @@ _AUTOMATION_STAGE_MAP = { "capacity_recommendation": "investigator", "quota_enforced": "safe", "notification_formatted": "safe", + "ansible_candidate_matched": "ai_router", + "ansible_check_mode_executed": "executor", + "ansible_apply_executed": "executor", + "ansible_rollback_executed": "executor", + "ansible_execution_skipped": "safe", } _AUTOMATION_STATUS_MAP = { "pending": "pending", diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index bf99813e..b872e489 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,5 +1,6 @@ from __future__ import annotations +from src.services.awooop_ansible_audit_service import build_ansible_truth from src.services.awooop_truth_chain_service import _clean_row, _truth_status @@ -61,3 +62,48 @@ def test_truth_status_marks_repeated_pending_drift_as_human_needed() -> None: assert status["needs_human"] is True assert "drift_report_pending_without_resolution" in status["blockers"] assert "drift_ai_confidence_zero" in status["blockers"] + + +def test_ansible_truth_surfaces_audited_check_mode_record() -> None: + truth = build_ansible_truth( + [ + { + "op_id": "op-ansible-1", + "operation_type": "ansible_check_mode_executed", + "status": "dry_run", + "actor": "platform_operator", + "input_playbook_path": "infra/ansible/playbooks/188-ai-web.yml", + "input_check_mode": "true", + "dry_run_result": {"changed": 1}, + "tags": ["ansible", "check_mode"], + "created_at": "2026-05-12T22:00:00+08:00", + } + ], + incident={"incident_id": "INC-1", "alertname": "momo pg_backup failed on 188"}, + drift=None, + ) + + assert truth["considered"] is True + assert truth["not_used_reason"] is None + assert truth["records"][0]["playbook_path"] == "infra/ansible/playbooks/188-ai-web.yml" + assert truth["records"][0]["check_mode"] == "true" + assert truth["records"][0]["dry_run_result"] == {"changed": 1} + assert "ansible_check_mode_executed" in truth["audit_contract"]["operation_types"] + assert truth["candidate_catalog"]["decision_effect"] == "none" + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:188-ai-web" + assert truth["candidate_catalog"]["candidates"][0]["auto_apply_enabled"] is False + + +def test_ansible_truth_keeps_catalog_hint_separate_from_runtime_use() -> None: + truth = build_ansible_truth( + [], + incident={"incident_id": "INC-2", "alertname": "nginx 502 upstream timeout"}, + drift=None, + ) + + assert truth["considered"] is False + assert truth["records"] == [] + assert truth["not_used_reason"].startswith("no automation_operation_log row") + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:nginx-sync" + assert truth["candidate_catalog"]["candidates"][0]["approval_required"] is True + assert truth["candidate_catalog"]["decision_effect"] == "none" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 11cdb3c5..90400cf4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,39 @@ +## 2026-05-12 | T3 Ansible audit surface 第一段 + +**背景**:Telegram / truth-chain live audit 顯示 Ansible 目前仍只是 repo/主機部署工具,沒有出現在 AI 自動化修復鏈路的 first-class audit record;Operator 無法知道「是否被考慮、是否 dry-run、為何沒用」。 + +**修正**: +- 新增 migration `adr090d_ansible_operation_types.sql`,擴充 `automation_operation_log.operation_type`: + - `ansible_candidate_matched` + - `ansible_check_mode_executed` + - `ansible_apply_executed` + - `ansible_rollback_executed` + - `ansible_execution_skipped` +- 新增 rollback migration `adr090d_ansible_operation_types_down.sql`;`run-migration.yml` 會跳過 `_down.sql`。 +- 新增 `awooop_ansible_audit_service.py`: + - 讀取 automation ops 中的 Ansible operation type/tag/backend。 + - 暴露 repo 既有 playbook catalog hint。 + - 明確標示 `decision_effect=none`,避免把候選 playbook 當成已執行。 +- truth-chain `execution.ansible` 現在會顯示: + - `considered` 是否有真實 Ansible audit record。 + - `records`、`audit_contract`、`candidate_catalog`、`not_used_reason`。 +- `incident_timeline_service` 補 Ansible operation type → stage mapping。 + +**驗證**: +- `py_compile`:Ansible audit service / truth-chain / incident timeline / truth-chain tests 通過。 +- `ruff --select F,E9`:All checks passed。 +- `pytest apps/api/tests/test_awooop_truth_chain_service.py apps/api/tests/test_platform_router_order.py apps/api/tests/test_awooop_operator_auth.py -q`:13 passed。 +- `ruby YAML.load_file(".gitea/workflows/run-migration.yml")`:ok。 +- `git diff --check`:ok。 + +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible first-class audit contract / truth-chain 可見性完成;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 +- 下一步:推版後觀察 `run-migration`,確認新增 migration 與 audit seed 都通過。 + ## 2026-05-12 | run-migration audit seed 再修正 **背景**:Gitea `run-migration` 在 `Seed asset_discovery_run (audit)` 再次失敗: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 9292dd8e..87c14867 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1892,6 +1892,32 @@ Phase 6 完成後 --- +### 2026-05-12 晚 (台北) — T3 Ansible declarative executor audit surface 第一段 + +**範圍**: +- `automation_operation_log.operation_type` CHECK 追加 Ansible executor audit states: + `ansible_candidate_matched` / `ansible_check_mode_executed` / + `ansible_apply_executed` / `ansible_rollback_executed` / + `ansible_execution_skipped`。 +- 新增 `awooop_ansible_audit_service.py`,把 repo 既有 Ansible playbook catalog 以 + read-only 方式暴露給 truth-chain。 +- truth-chain `execution.ansible` 改為顯示: + - 是否真的有 `automation_operation_log` Ansible audit record。 + - audit contract / required fields。 + - static catalog keyword hints,且 `decision_effect=none`,避免把候選 playbook 誤判成已自動修復。 +- `incident_timeline_service` 加入 Ansible operation type stage mapping。 + +**已驗證**: +- 本地 `py_compile` / `ruff F,E9` / `git diff --check` 通過。 +- `test_awooop_truth_chain_service.py`、router order、operator auth 共 13 passed。 +- `run-migration.yml` YAML parse 通過;新增 `_down.sql` 會被既有 workflow skip 規則排除。 + +**仍未宣稱完成**: +- 這不是 Ansible 自動修復執行器接線;目前只建立 first-class audit contract 與 truth-chain 可見性。 +- 下一段需把 decision / approval execution path 在「只 dry-run/check-mode」下寫入上述 operation types,再談 apply。 + +--- + ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) **觸發**:統帥全景盤查 AI 自動化節點後,發現 Playbook 自動修復鏈路有 3 個結構性斷點。 From 49ffb5bb194ec71d606d15a5f2f6bd96d36f31f1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 03:59:22 +0800 Subject: [PATCH 06/15] fix(ci): repair migration audit json literal --- .gitea/workflows/run-migration.yml | 2 +- docs/LOGBOOK.md | 9 +++++++++ .../specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md | 5 +++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/run-migration.yml b/.gitea/workflows/run-migration.yml index 0ec25bf7..452ccde3 100644 --- a/.gitea/workflows/run-migration.yml +++ b/.gitea/workflows/run-migration.yml @@ -153,7 +153,7 @@ jobs: 'success', NOW(), NOW(), - '{\"psql\": 1, \"gitea_ci\": 1}'::jsonb, + '{"psql": 1, "gitea_ci": 1}'::jsonb, '${SUMMARY_JSON_SQL}'::jsonb ); SQL diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 90400cf4..3a2788b7 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -34,6 +34,15 @@ - T3:Ansible first-class audit contract / truth-chain 可見性完成;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 - 下一步:推版後觀察 `run-migration`,確認新增 migration 與 audit seed 都通過。 +**production push 追加**: +- Gitea `run-migration` run `1933` 顯示 migration 本體已成功: + - `adr090d_ansible_operation_types.sql` 以 owner fallback 套用成功。 +- 但 audit seed 仍失敗,這次不是 `:'commit_sha'`,而是 tools JSON literal 在 unquoted heredoc 下仍保留反斜線: + - `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb` + - PostgreSQL 回 `invalid input syntax for type json`。 +- 已修 `.gitea/workflows/run-migration.yml`:tools JSON 改為 `'{"psql": 1, "gitea_ci": 1}'::jsonb`。 +- 因本次 migration 已套用但 audit row 失敗,需補 production `asset_discovery_run` 稽核記錄,並以下一個 migration push 驗證 workflow live gate。 + ## 2026-05-12 | run-migration audit seed 再修正 **背景**:Gitea `run-migration` 在 `Seed asset_discovery_run (audit)` 再次失敗: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 87c14867..17e34eb0 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1916,6 +1916,11 @@ Phase 6 完成後 - 這不是 Ansible 自動修復執行器接線;目前只建立 first-class audit contract 與 truth-chain 可見性。 - 下一段需把 decision / approval execution path 在「只 dry-run/check-mode」下寫入上述 operation types,再談 apply。 +**production 追加**: +- Gitea `run-migration` run `1933`:`adr090d_ansible_operation_types.sql` 已成功套用,含 owner fallback。 +- 同 run 的 `Seed asset_discovery_run (audit)` 仍失敗;新根因是 unquoted heredoc 下 tools JSON literal 還寫成 `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb`,PostgreSQL 視為非法 JSON。 +- 後續修正:workflow tools JSON literal 改成 `'{"psql": 1, "gitea_ci": 1}'::jsonb`;仍需補寫本次 migration audit row,並用下一個 migration push 驗證 live gate。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) From 07000dae3a4119230e36db51e2bac1e2816f1a96 Mon Sep 17 00:00:00 2001 From: AWOOOI CD Date: Tue, 12 May 2026 19:59:30 +0000 Subject: [PATCH 07/15] chore(cd): deploy ca80972 [skip ci] --- k8s/awoooi-prod/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index 3fb1a864..6ecca770 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -40,7 +40,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: b4d367eeb463eccda5aec8aa9c90f19897dbd634 + newTag: ca80972dc73cb647f8fab3bf9439784c4b8eef7b - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: b4d367eeb463eccda5aec8aa9c90f19897dbd634 + newTag: ca80972dc73cb647f8fab3bf9439784c4b8eef7b From f61747aeac11eb9b4cb3fe8a446529b7a62648d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 04:03:48 +0800 Subject: [PATCH 08/15] docs(awooop): record t3 ansible deployment --- docs/LOGBOOK.md | 22 ++++++++++++++++--- ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 7 +++++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3a2788b7..fe8b8a75 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -31,8 +31,8 @@ - T0:Truth-chain read-only API 完成、部署、production smoke 完成。 - T1:Channel Event hardening 完成、部署、production smoke 完成。 - T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 -- T3:Ansible first-class audit contract / truth-chain 可見性完成;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 -- 下一步:推版後觀察 `run-migration`,確認新增 migration 與 audit seed 都通過。 +- T3:Ansible first-class audit contract / truth-chain 可見性完成、已部署;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 +- 下一步:T3 第二段接 decision / approval execution 的 Ansible check-mode audit row,仍不直接 apply。 **production push 追加**: - Gitea `run-migration` run `1933` 顯示 migration 本體已成功: @@ -41,7 +41,23 @@ - `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb` - PostgreSQL 回 `invalid input syntax for type json`。 - 已修 `.gitea/workflows/run-migration.yml`:tools JSON 改為 `'{"psql": 1, "gitea_ci": 1}'::jsonb`。 -- 因本次 migration 已套用但 audit row 失敗,需補 production `asset_discovery_run` 稽核記錄,並以下一個 migration push 驗證 workflow live gate。 +- 已補 production `asset_discovery_run` repair audit row: + - `triggered_by=codex:gitea-migration-audit-repair` + - `summary.type=ci_migration_manual_repair` + - `summary.commit_sha=ca80972dc73cb647f8fab3bf9439784c4b8eef7b` +- Production DB constraint 驗證:`automation_operation_log_type_valid` 已包含全部 `ansible_*` operation types。 +- CD 部署: + - `07000dae chore(cd): deploy ca80972 [skip ci]` + - API/Web/Worker image 均為 `ca80972dc73cb647f8fab3bf9439784c4b8eef7b` + - rollout success。 +- Truth-chain smoke(B6C589): + - `truth_status=manual_required/blocked` + - `mcp_gateway_total=8` + - `execution.ansible.considered=false` + - `execution.ansible.records=0` + - `not_used_reason=no automation_operation_log row with Ansible operation type, tag, or executor backend for this source` + - `audit_contract.schema_version=ansible_executor_audit_v1` +- Caveat:下一個 migration push 仍需 live 驗證 `run-migration` audit seed 是否完全通過;本輪 workflow 修正後沒有新的 migration 觸發可重跑。 ## 2026-05-12 | run-migration audit seed 再修正 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 17e34eb0..517a1e11 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1919,7 +1919,12 @@ Phase 6 完成後 **production 追加**: - Gitea `run-migration` run `1933`:`adr090d_ansible_operation_types.sql` 已成功套用,含 owner fallback。 - 同 run 的 `Seed asset_discovery_run (audit)` 仍失敗;新根因是 unquoted heredoc 下 tools JSON literal 還寫成 `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb`,PostgreSQL 視為非法 JSON。 -- 後續修正:workflow tools JSON literal 改成 `'{"psql": 1, "gitea_ci": 1}'::jsonb`;仍需補寫本次 migration audit row,並用下一個 migration push 驗證 live gate。 +- 後續修正:workflow tools JSON literal 改成 `'{"psql": 1, "gitea_ci": 1}'::jsonb`。 +- 已補 production `asset_discovery_run` repair audit row(`ci_migration_manual_repair` / `commit_sha=ca80972dc73cb647f8fab3bf9439784c4b8eef7b`)。 +- Production DB constraint 已確認包含全部 `ansible_*` operation types。 +- CD 已部署 `ca80972d` image,deploy marker `07000dae`;API/Web/Worker rollout success。 +- B6C589 truth-chain smoke:`manual_required/blocked`、`mcp_gateway_total=8`、`execution.ansible.considered=false`、`records=0`、not_used_reason 清楚顯示沒有 Ansible audit record。 +- 下一個 migration push 仍需驗證 `run-migration` audit seed live gate,因本輪 workflow 修正後未再新增 migration 觸發重跑。 --- From 3799e0db0d30f29fdc251197634d2fca4c2c67fd Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 04:07:23 +0800 Subject: [PATCH 09/15] feat(awooop): audit ansible decision candidates --- .../services/awooop_ansible_audit_service.py | 171 ++++++++++++++++++ apps/api/src/services/decision_manager.py | 37 ++++ .../tests/test_awooop_truth_chain_service.py | 43 ++++- docs/LOGBOOK.md | 19 ++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 9 + 5 files changed, 278 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/awooop_ansible_audit_service.py b/apps/api/src/services/awooop_ansible_audit_service.py index 74d5b728..40e629da 100644 --- a/apps/api/src/services/awooop_ansible_audit_service.py +++ b/apps/api/src/services/awooop_ansible_audit_service.py @@ -8,8 +8,16 @@ hints are runtime remediation. from __future__ import annotations +import json from typing import Any +import structlog +from sqlalchemy import text + +from src.db.base import get_db_context + +logger = structlog.get_logger(__name__) + ANSIBLE_OPERATION_TYPES = frozenset({ "ansible_candidate_matched", @@ -27,6 +35,9 @@ _CATALOG: tuple[dict[str, Any], ...] = ( "domains": ["swap", "harbor", "sentry", "gitea", "langfuse", "bitan", "runner", "keepalived", "nginx"], "keywords": [ "110", + "docker", + "container", + "dockercontainerunhealthy", "swap", "harbor", "sentry", @@ -49,6 +60,9 @@ _CATALOG: tuple[dict[str, Any], ...] = ( "domains": ["docker", "momo_backup", "signoz", "minio", "litellm", "n8n", "open_webui", "nginx"], "keywords": [ "188", + "docker", + "container", + "dockercontainerunhealthy", "momo", "backup", "postgresql", @@ -260,3 +274,160 @@ def build_ansible_truth( else "no automation_operation_log row with Ansible operation type, tag, or executor backend for this source" ), } + + +def _incident_public_dict(incident: Any) -> dict[str, Any]: + if incident is None: + return {} + if isinstance(incident, dict): + return incident + severity = getattr(incident, "severity", None) + signals_payload: list[dict[str, Any]] = [] + for signal in getattr(incident, "signals", None) or []: + signals_payload.append({ + "alert_name": getattr(signal, "alert_name", None), + "labels": getattr(signal, "labels", None) or {}, + "annotations": getattr(signal, "annotations", None) or {}, + }) + return { + "incident_id": getattr(incident, "incident_id", None), + "project_id": getattr(incident, "project_id", None), + "alertname": getattr(incident, "alertname", None), + "alert_category": getattr(incident, "alert_category", None), + "notification_type": getattr(incident, "notification_type", None), + "severity": getattr(severity, "value", severity), + "affected_services": getattr(incident, "affected_services", None) or [], + "signals": signals_payload, + } + + +def build_ansible_decision_audit_payload( + *, + incident: Any, + proposal_data: dict[str, Any], + decision_path: str, + not_used_reason: str, +) -> dict[str, Any] | None: + """Return an AOL payload when Ansible has catalog candidates for a decision.""" + + incident_payload = _incident_public_dict(incident) + hints = _catalog_hints(incident_payload, None) + candidates = hints.get("candidates") or [] + if not candidates: + return None + + incident_id = str(incident_payload.get("incident_id") or "") + input_payload = { + "incident_id": incident_id, + "executor": "ansible", + "execution_backend": "ansible", + "decision_path": decision_path, + "check_mode": True, + "apply_enabled": False, + "approval_required": True, + "candidate_catalog_schema": hints["match_mode"], + "executor_candidates": [ + { + "catalog_id": row["catalog_id"], + "playbook_path": row["playbook_path"], + "inventory_hosts": row["inventory_hosts"], + "risk_level": row["risk_level"], + "match_score": row["match_score"], + "matched_keywords": row["matched_keywords"], + } + for row in candidates[:5] + ], + "proposal_source": proposal_data.get("source", ""), + "proposal_risk_level": proposal_data.get("risk_level", ""), + "proposal_action_preview": str( + proposal_data.get("action") + or proposal_data.get("kubectl_command") + or "" + )[:240], + } + output_payload = { + "not_used_reason": not_used_reason, + "decision_effect": "audit_only", + "next_required_step": "wire approval_execution to Ansible check-mode before apply", + } + return { + "operation_type": "ansible_candidate_matched", + "status": "dry_run", + "input": input_payload, + "output": output_payload, + "dry_run_result": { + "check_mode_executed": False, + "candidate_count": len(candidates), + "reason": not_used_reason, + }, + "tags": ["ansible", "decision", "candidate", "check_mode_pending"], + } + + +async def record_ansible_decision_audit( + *, + incident: Any, + proposal_data: dict[str, Any], + decision_path: str, + not_used_reason: str, +) -> bool: + """Write a best-effort Ansible candidate audit row for one decision.""" + + payload = build_ansible_decision_audit_payload( + incident=incident, + proposal_data=proposal_data, + decision_path=decision_path, + not_used_reason=not_used_reason, + ) + if payload is None: + return False + + incident_id = payload["input"]["incident_id"] + project_id = getattr(incident, "project_id", None) or "awoooi" + try: + async with get_db_context(str(project_id)) as db: + existing = await db.execute( + text(""" + SELECT op_id + FROM automation_operation_log + WHERE operation_type = 'ansible_candidate_matched' + AND input ->> 'incident_id' = :incident_id + AND input ->> 'executor' = 'ansible' + LIMIT 1 + """), + {"incident_id": incident_id}, + ) + if existing.scalar() is not None: + return False + await db.execute( + text(""" + INSERT INTO automation_operation_log ( + operation_type, actor, status, + input, output, dry_run_result, tags + ) VALUES ( + :operation_type, + 'decision_manager', + :status, + CAST(:input AS jsonb), + CAST(:output AS jsonb), + CAST(:dry_run_result AS jsonb), + :tags + ) + """), + { + "operation_type": payload["operation_type"], + "status": payload["status"], + "input": json.dumps(payload["input"], ensure_ascii=False), + "output": json.dumps(payload["output"], ensure_ascii=False), + "dry_run_result": json.dumps(payload["dry_run_result"], ensure_ascii=False), + "tags": payload["tags"], + }, + ) + return True + except Exception as exc: + logger.warning( + "ansible_decision_audit_write_failed", + incident_id=incident_id, + error=str(exc), + ) + return False diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index c77b0af9..a5f021a7 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1790,6 +1790,25 @@ class DecisionManager: token.proposal_data["auto_approve_reason"] = auto_decision.reason_detail await self._save_token(token) + try: + from src.services.awooop_ansible_audit_service import ( + record_ansible_decision_audit as _record_ansible_decision_audit, + ) + + _fire_and_forget( + _record_ansible_decision_audit( + incident=incident, + proposal_data=token.proposal_data, + decision_path="auto_execute", + not_used_reason=( + "auto_execute selected existing executor path; " + "Ansible check-mode is not wired yet" + ), + ) + ) + except Exception as _ansible_audit_err: + logger.debug("ansible_decision_audit_schedule_error", error=str(_ansible_audit_err)) + # 觸發自動執行 (非阻塞) _fire_and_forget( self._auto_execute(incident, token) @@ -1813,6 +1832,24 @@ class DecisionManager: ), ) ) + try: + from src.services.awooop_ansible_audit_service import ( + record_ansible_decision_audit as _record_ansible_decision_audit, + ) + + _fire_and_forget( + _record_ansible_decision_audit( + incident=incident, + proposal_data=token.proposal_data, + decision_path="manual_approval", + not_used_reason=( + "manual approval required; Ansible check-mode " + "is not wired to approval execution yet" + ), + ) + ) + except Exception as _ansible_audit_err: + logger.debug("ansible_decision_audit_schedule_error", error=str(_ansible_audit_err)) _fire_and_forget( _push_decision_to_telegram(incident, token.proposal_data) ) diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index b872e489..a2d09172 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,6 +1,11 @@ from __future__ import annotations -from src.services.awooop_ansible_audit_service import build_ansible_truth +from types import SimpleNamespace + +from src.services.awooop_ansible_audit_service import ( + build_ansible_decision_audit_payload, + build_ansible_truth, +) from src.services.awooop_truth_chain_service import _clean_row, _truth_status @@ -107,3 +112,39 @@ def test_ansible_truth_keeps_catalog_hint_separate_from_runtime_use() -> None: assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:nginx-sync" assert truth["candidate_catalog"]["candidates"][0]["approval_required"] is True assert truth["candidate_catalog"]["decision_effect"] == "none" + + +def test_ansible_decision_audit_payload_is_dry_run_only() -> None: + incident = SimpleNamespace( + incident_id="INC-DOCKER", + project_id="awoooi", + alert_category="infrastructure", + notification_type="TYPE-3", + severity=SimpleNamespace(value="P3"), + affected_services=["bitan-pharmacy-bitan-1"], + signals=[ + SimpleNamespace( + alert_name="DockerContainerUnhealthy", + labels={"alertname": "DockerContainerUnhealthy", "container": "bitan-pharmacy-bitan-1"}, + annotations={}, + ) + ], + ) + + payload = build_ansible_decision_audit_payload( + incident=incident, + proposal_data={"source": "expert_system", "risk_level": "low", "action": "NO_ACTION"}, + decision_path="manual_approval", + not_used_reason="manual approval required; Ansible check-mode is not wired yet", + ) + + assert payload is not None + assert payload["operation_type"] == "ansible_candidate_matched" + assert payload["status"] == "dry_run" + assert payload["input"]["executor"] == "ansible" + assert payload["input"]["check_mode"] is True + assert payload["input"]["apply_enabled"] is False + assert payload["input"]["approval_required"] is True + assert payload["input"]["executor_candidates"] + assert payload["output"]["decision_effect"] == "audit_only" + assert payload["dry_run_result"]["check_mode_executed"] is False diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index fe8b8a75..3ec260df 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -59,6 +59,25 @@ - `audit_contract.schema_version=ansible_executor_audit_v1` - Caveat:下一個 migration push 仍需 live 驗證 `run-migration` audit seed 是否完全通過;本輪 workflow 修正後沒有新的 migration 觸發可重跑。 +**T3 第二段本地實作**: +- `awooop_ansible_audit_service.py` 新增 decision audit payload/writer: + - 只有 static catalog 有候選 playbook 時才寫 `automation_operation_log`。 + - operation_type=`ansible_candidate_matched`。 + - status=`dry_run`。 + - `input.executor=ansible`、`check_mode=true`、`apply_enabled=false`、`approval_required=true`。 + - `output.decision_effect=audit_only`。 +- `decision_manager` 在 auto-execute / manual-approval 分支都排程 best-effort audit write: + - 不改 executor。 + - 不跑 Ansible。 + - 不阻塞決策和 Telegram。 +- Docker/container 類 incident 也會命中 Ansible catalog hint,讓 B6C589 這類事件後續新 decision 能留下 Ansible candidate audit row。 +- 本地驗證: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest test_awooop_truth_chain_service.py test_platform_router_order.py test_awooop_operator_auth.py -q`:14 passed。 + - `git diff --check`:pass。 +- 待推版與 production smoke。 + ## 2026-05-12 | run-migration audit seed 再修正 **背景**:Gitea `run-migration` 在 `Seed asset_discovery_run (audit)` 再次失敗: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 517a1e11..a6778f97 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1926,6 +1926,15 @@ Phase 6 完成後 - B6C589 truth-chain smoke:`manual_required/blocked`、`mcp_gateway_total=8`、`execution.ansible.considered=false`、`records=0`、not_used_reason 清楚顯示沒有 Ansible audit record。 - 下一個 migration push 仍需驗證 `run-migration` audit seed live gate,因本輪 workflow 修正後未再新增 migration 觸發重跑。 +**T3 第二段本地追加**: +- `decision_manager` 在 auto-execute / manual-approval 分支新增 best-effort Ansible candidate audit write。 +- 僅在 catalog 有候選 playbook 時寫 `automation_operation_log`: + `operation_type=ansible_candidate_matched`、`status=dry_run`、 + `input.check_mode=true`、`input.apply_enabled=false`、 + `output.decision_effect=audit_only`。 +- 這仍不是 Ansible 執行器;它只讓 truth-chain 能看到 AI decision path 曾考慮 Ansible candidate,以及為何未進入 check-mode/apply。 +- 本地 `py_compile` / `ruff F,E9` / 14 個 truth-chain/operator/router tests 通過;待推版和 production smoke。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) From 90b9ddb7a524d8fa2416f443a0f7c4438ee04bde Mon Sep 17 00:00:00 2001 From: AWOOOI CD Date: Tue, 12 May 2026 20:12:20 +0000 Subject: [PATCH 10/15] chore(cd): deploy 3799e0d [skip ci] --- k8s/awoooi-prod/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index 6ecca770..689fc9f0 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -40,7 +40,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: ca80972dc73cb647f8fab3bf9439784c4b8eef7b + newTag: 3799e0db0d30f29fdc251197634d2fca4c2c67fd - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: ca80972dc73cb647f8fab3bf9439784c4b8eef7b + newTag: 3799e0db0d30f29fdc251197634d2fca4c2c67fd From b0a8302dd78c2f07d76c1d689ff5ce5e229ee1ba Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 04:17:04 +0800 Subject: [PATCH 11/15] docs(awooop): record t3 decision audit deployment --- docs/LOGBOOK.md | 57 +++++++++++++++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 10 ++++ 2 files changed, 67 insertions(+) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3ec260df..89c3765a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,60 @@ +## 2026-05-13 | T3 Ansible decision candidate audit 已推版 + +**背景**:T3 第一段只讓 truth-chain 看得到 Ansible audit contract 與 repo playbook catalog;但 AI decision path 還不會留下「曾考慮 Ansible、但尚未進 check-mode/apply」的 first-class record。這會讓 Telegram / Operator Console 仍看不出 Ansible 是否真的被 AI 修復鏈評估過。 + +**修正**: +- `awooop_ansible_audit_service.py` 新增 decision candidate audit payload / writer。 +- `decision_manager` 在 auto-execute / manual-approval 分支排程 best-effort `ansible_candidate_matched` audit write。 +- Audit row 明確是 dry-run / audit-only: + - `status=dry_run` + - `input.executor=ansible` + - `input.check_mode=true` + - `input.apply_enabled=false` + - `input.approval_required=true` + - `output.decision_effect=audit_only` +- Docker/container 類 incident 也會命中 188 / 110 Ansible catalog hints;未來新 decision 可在 truth-chain 顯示「有候選、尚未執行 check-mode」。 + +**驗證與推版**: +- Local: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest apps/api/tests/test_awooop_truth_chain_service.py apps/api/tests/test_platform_router_order.py apps/api/tests/test_awooop_operator_auth.py -q`:14 passed。 + - Tier 3 adjacent tests:133 passed, 1 existing RuntimeWarning。 + - `git diff --check`:pass。 +- Gitea: + - `3799e0db feat(awooop): audit ansible decision candidates` 已推 `gitea main`。 + - Code Review run `1936`:success。 + - CD run `1935`:success。 + - Deploy marker:`90b9ddb7 chore(cd): deploy 3799e0d [skip ci]`。 +- Production: + - API/Web/Worker image 均為 `192.168.0.110:5000/awoooi/*:3799e0db0d30f29fdc251197634d2fca4c2c67fd`。 + - K3s rollout status:API/Web/Worker success。 + - `/api/v1/health`:healthy,mock_mode=false。 + - Pure function smoke(API pod):DockerContainerUnhealthy 事件可產生 `ansible_candidate_matched` payload,`candidate_count=2`,`check_mode_executed=false`。 + - Truth-chain smoke `INC-20260512-B6C589`: + - `source_type=incident` + - `current_stage=manual_required` + - `stage_status=blocked` + - `needs_human=true` + - `execution.ansible.audit_contract.schema_version=ansible_executor_audit_v1` + - `ansible_candidates=2` + - `mcp_gateway_total=8` + - Truth-chain smoke `7f858956`: + - `source_type=drift_report` + - `current_stage=dedup_or_repeat_updated` + - `stage_status=pending` + - `needs_human=true` + - `repeat_12h=12` + - `outbound_visible=2` + +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible audit contract + decision candidate dry-run audit 完成、部署、production smoke 完成。 +- 仍未完成:Ansible 真正 check-mode executor、diff artifact、apply / rollback audit、T4 drift fingerprint FSM、T5 incident / approval / execution reconciliation、first-class MCP Gateway enforcement。 + ## 2026-05-12 | T3 Ansible audit surface 第一段 **背景**:Telegram / truth-chain live audit 顯示 Ansible 目前仍只是 repo/主機部署工具,沒有出現在 AI 自動化修復鏈路的 first-class audit record;Operator 無法知道「是否被考慮、是否 dry-run、為何沒用」。 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index a6778f97..cdf8a1ed 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1935,6 +1935,16 @@ Phase 6 完成後 - 這仍不是 Ansible 執行器;它只讓 truth-chain 能看到 AI decision path 曾考慮 Ansible candidate,以及為何未進入 check-mode/apply。 - 本地 `py_compile` / `ruff F,E9` / 14 個 truth-chain/operator/router tests 通過;待推版和 production smoke。 +**T3 第二段 production verified(2026-05-13 台北)**: +- `3799e0db feat(awooop): audit ansible decision candidates` 已推 Gitea main,Code Review run `1936` success,CD run `1935` success。 +- Deploy marker:`90b9ddb7 chore(cd): deploy 3799e0d [skip ci]`。 +- Production API/Web/Worker image 均為 `3799e0db0d30f29fdc251197634d2fca4c2c67fd`,K3s rollout success,health 200 / `mock_mode=false`。 +- API pod pure smoke:DockerContainerUnhealthy 事件可產生 `ansible_candidate_matched` audit payload,`candidate_count=2`,`check_mode_executed=false`。 +- Truth-chain smoke: + - `INC-20260512-B6C589` → `manual_required/blocked`,`mcp_gateway_total=8`,`execution.ansible.audit_contract=ansible_executor_audit_v1`,`ansible_candidates=2`。 + - `7f858956` → `dedup_or_repeat_updated/pending`,`repeat_12h=12`,`outbound_visible=2`。 +- 邊界:仍未執行 Ansible check-mode / apply / rollback;T3 目前完成的是 first-class candidate audit,而不是修復執行器。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) From 5b34877429c16c42f0f894eb4d7f0484711fde9b Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 07:36:21 +0800 Subject: [PATCH 12/15] feat(awooop): expose drift repeat fingerprint --- apps/api/src/repositories/drift_repository.py | 25 +++ .../services/awooop_truth_chain_service.py | 56 ++---- .../src/services/drift_narrator_service.py | 33 +++- apps/api/src/services/drift_repeat_state.py | 180 ++++++++++++++++++ .../tests/test_awooop_truth_chain_service.py | 88 +++++++++ 5 files changed, 339 insertions(+), 43 deletions(-) create mode 100644 apps/api/src/services/drift_repeat_state.py diff --git a/apps/api/src/repositories/drift_repository.py b/apps/api/src/repositories/drift_repository.py index a40c16b3..15124c5d 100644 --- a/apps/api/src/repositories/drift_repository.py +++ b/apps/api/src/repositories/drift_repository.py @@ -167,6 +167,31 @@ class DriftReportRepository: {"report_id": report_id, "narrative": narrative}, ) + async def get_repeat_state(self, report: DriftReport) -> dict: + """Return stable fingerprint repeat state for a drift report.""" + from src.services.drift_repeat_state import build_drift_repeat_state + + async with get_db_context() as db: + result = await db.execute( + text(""" + SELECT + report_id, + namespace, + status, + scanned_at, + created_at, + items + FROM drift_reports + WHERE namespace = :namespace + AND created_at > now() - interval '24 hours' + ORDER BY scanned_at DESC + LIMIT 200 + """), + {"namespace": report.namespace}, + ) + rows = [dict(row) for row in result.mappings().all()] + return build_drift_repeat_state(report, rows) + _drift_repo: DriftReportRepository | None = None diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index ed182eb2..91e840d6 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -18,6 +18,7 @@ from sqlalchemy import text from src.db.base import get_db_context from src.services.awooop_ansible_audit_service import build_ansible_truth +from src.services.drift_repeat_state import build_drift_repeat_state logger = structlog.get_logger(__name__) @@ -266,6 +267,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ created_at, resolved_at, interpretation, + items, narrative_text FROM drift_reports WHERE report_id = :source_id @@ -473,55 +475,27 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ "reports": [], } if drift is not None: - repeat_summary = await _fetch_one( + recent_drift_reports = await _fetch_all( db, """ SELECT - count(*) AS occurrences_12h, - min(scanned_at) AS first_scanned_at, - max(scanned_at) AS last_scanned_at + report_id, + namespace, + status, + scanned_at, + created_at, + items, + interpretation, + narrative_text FROM drift_reports - WHERE created_at > now() - interval '12 hours' + WHERE created_at > now() - interval '24 hours' AND namespace = :namespace - AND status = :status - AND high_count = :high_count - AND medium_count = :medium_count - AND info_count = :info_count - """, - { - "namespace": drift["namespace"], - "status": drift["status"], - "high_count": drift["high_count"], - "medium_count": drift["medium_count"], - "info_count": drift["info_count"], - }, - ) - repeat_reports = await _fetch_all( - db, - """ - SELECT report_id, scanned_at, created_at, status, interpretation, narrative_text - FROM drift_reports - WHERE created_at > now() - interval '12 hours' - AND namespace = :namespace - AND status = :status - AND high_count = :high_count - AND medium_count = :medium_count - AND info_count = :info_count ORDER BY scanned_at DESC - LIMIT 20 + LIMIT 200 """, - { - "namespace": drift["namespace"], - "status": drift["status"], - "high_count": drift["high_count"], - "medium_count": drift["medium_count"], - "info_count": drift["info_count"], - }, + {"namespace": drift["namespace"]}, ) - drift_repeats = { - **(repeat_summary or {}), - "reports": repeat_reports, - } + drift_repeats = build_drift_repeat_state(drift, recent_drift_reports) gateway_mcp_rows = await _fetch_all( db, diff --git a/apps/api/src/services/drift_narrator_service.py b/apps/api/src/services/drift_narrator_service.py index e09448e6..8e29651f 100644 --- a/apps/api/src/services/drift_narrator_service.py +++ b/apps/api/src/services/drift_narrator_service.py @@ -148,7 +148,13 @@ class DriftNarratorService: # 2026-04-18 B 方案: LLM 同時產 narrative + 結構化 items(取代 str()[:30]) # 2026-04-20 P0.2: 追加 recommendation(action/confidence/reason) narrative, items, recommendation = await self._generate_narrative_and_items(report, interpretation) - await self._send_telegram(report, narrative, items, recommendation) + repeat_state = None + try: + from src.repositories.drift_repository import get_drift_repository + repeat_state = await get_drift_repository().get_repeat_state(report) + except Exception as e: + logger.warning("drift_repeat_state_lookup_failed", report_id=report.report_id, error=str(e)) + await self._send_telegram(report, narrative, items, recommendation, repeat_state) # 寫入 DB narrative_text (Phase 30 ADR-067) try: @@ -643,6 +649,7 @@ class DriftNarratorService: narrative: str, items: list[dict], recommendation: dict | None = None, + repeat_state: dict | None = None, ) -> None: """ 推送 TYPE-4D Config Drift 卡片(ADR-075)+ B 方案智能摘要 @@ -654,7 +661,7 @@ class DriftNarratorService: """ from src.services.telegram_gateway import get_telegram_gateway - diff_summary = self._render_telegram_body(report, narrative, items, recommendation) + diff_summary = self._render_telegram_body(report, narrative, items, recommendation, repeat_state) try: tg = get_telegram_gateway() @@ -711,6 +718,7 @@ class DriftNarratorService: narrative: str, items: list[dict], recommendation: dict | None = None, + repeat_state: dict | None = None, ) -> str: """ 組裝 Telegram 卡片 body(B 方案格式 + P0.2 AI 推薦) @@ -741,6 +749,10 @@ class DriftNarratorService: }.get(_act, _act) lines.append(f"🎯 AI 建議:{_emoji_action} ({int(_conf * 100)}%) — {_reason}\n") + repeat_line = self._render_repeat_state(repeat_state) + if repeat_line: + lines.append(f"{repeat_line}\n") + lines.append(f"🤖 AI 研判\n{narrative}\n") # 用非 trivial + 非白名單 的實際可操作數顯示 @@ -761,6 +773,23 @@ class DriftNarratorService: return "\n".join(lines) + def _render_repeat_state(self, repeat_state: dict | None) -> str: + """Render operator-visible repeat/stage metadata for Telegram.""" + if not repeat_state: + return "" + fingerprint = str(repeat_state.get("fingerprint") or "unknown") + occurrences = int(repeat_state.get("occurrences_12h") or 0) + window_hours = int(repeat_state.get("window_hours") or 12) + stage = str(repeat_state.get("operator_stage") or "unknown") + if occurrences <= 1: + repeat_text = f"{window_hours}h 內首次出現" + else: + repeat_text = f"{window_hours}h 內第 {occurrences} 次同指紋" + return ( + "流程: drift_scanned → ai_analyzed → " + f"{stage}\n重複: {repeat_text}\n指紋: {fingerprint}" + ) + # ============================================================ # Singleton diff --git a/apps/api/src/services/drift_repeat_state.py b/apps/api/src/services/drift_repeat_state.py new file mode 100644 index 00000000..36b9ec6b --- /dev/null +++ b/apps/api/src/services/drift_repeat_state.py @@ -0,0 +1,180 @@ +"""Stable repeat identity for Config Drift reports. + +The drift scanner emits a fresh ``report_id`` for every run. Operators need a +stable identity that answers whether two reports describe the same drift, not +just whether they have the same HIGH/MEDIUM/INFO counts. +""" + +from __future__ import annotations + +import hashlib +import json +from datetime import datetime, timedelta, timezone +from typing import Any + + +SCHEMA_VERSION = "drift_repeat_state_v1" +FINGERPRINT_VERSION = "drift_fingerprint_v1" + + +def _get(obj: Any, key: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _enum_value(value: Any) -> Any: + return getattr(value, "value", value) + + +def _jsonable(value: Any) -> Any: + value = _enum_value(value) + if isinstance(value, dict): + return {str(k): _jsonable(v) for k, v in value.items()} + if isinstance(value, list): + return [_jsonable(v) for v in value] + if isinstance(value, tuple): + return [_jsonable(v) for v in value] + if isinstance(value, datetime): + return value.isoformat() + return value + + +def _canonical_json(value: Any) -> str: + return json.dumps( + _jsonable(value), + ensure_ascii=False, + sort_keys=True, + separators=(",", ":"), + default=str, + ) + + +def _parse_datetime(value: Any) -> datetime | None: + if value is None: + return None + if isinstance(value, datetime): + parsed = value + if parsed.tzinfo is not None: + return parsed.astimezone(timezone.utc).replace(tzinfo=None) + return parsed + if isinstance(value, str): + try: + parsed = datetime.fromisoformat(value.replace("Z", "+00:00")) + if parsed.tzinfo is not None: + return parsed.astimezone(timezone.utc).replace(tzinfo=None) + return parsed + except ValueError: + return None + return None + + +def _iso(value: Any) -> str | None: + parsed = _parse_datetime(value) + return parsed.isoformat() if parsed else None + + +def drift_item_identity(item: Any) -> dict[str, Any]: + """Return the stable fields that define one drift item.""" + return { + "resource_kind": str(_get(item, "resource_kind", "")), + "resource_name": str(_get(item, "resource_name", "")), + "namespace": str(_get(item, "namespace", "")), + "field_path": str(_get(item, "field_path", "")), + "drift_level": str(_enum_value(_get(item, "drift_level", ""))), + "git_value": _jsonable(_get(item, "git_value")), + "actual_value": _jsonable(_get(item, "actual_value")), + "is_allowlisted": bool(_get(item, "is_allowlisted", False)), + } + + +def build_drift_fingerprint(namespace: str, items: list[Any]) -> str: + """Build a deterministic fingerprint from namespace + sorted drift items.""" + identities = [drift_item_identity(item) for item in items] + identities.sort(key=_canonical_json) + payload = { + "version": FINGERPRINT_VERSION, + "namespace": namespace, + "items": identities, + } + digest = hashlib.sha256(_canonical_json(payload).encode("utf-8")).hexdigest() + return f"dfp_{digest[:16]}" + + +def _report_identity(report: Any) -> dict[str, Any]: + items = _get(report, "items", []) or [] + namespace = str(_get(report, "namespace", "")) + return { + "report_id": _get(report, "report_id"), + "namespace": namespace, + "status": str(_enum_value(_get(report, "status", ""))), + "scanned_at": _get(report, "scanned_at"), + "created_at": _get(report, "created_at"), + "fingerprint": build_drift_fingerprint(namespace, list(items)), + } + + +def build_drift_repeat_state( + report: Any, + recent_reports: list[Any], + *, + window_hours: int = 12, + max_reports: int = 20, +) -> dict[str, Any]: + """Summarize repeat state for one drift report using stable fingerprints.""" + current = _report_identity(report) + current_time = ( + _parse_datetime(current.get("scanned_at")) + or _parse_datetime(current.get("created_at")) + or datetime.now() + ) + cutoff = current_time - timedelta(hours=window_hours) + + by_id: dict[str, dict[str, Any]] = {} + for candidate in [report, *recent_reports]: + identity = _report_identity(candidate) + report_id = str(identity.get("report_id") or "") + if not report_id: + continue + candidate_time = ( + _parse_datetime(identity.get("scanned_at")) + or _parse_datetime(identity.get("created_at")) + ) + if candidate_time is not None and candidate_time < cutoff: + continue + if identity["fingerprint"] != current["fingerprint"]: + continue + by_id[report_id] = identity + + matches = sorted( + by_id.values(), + key=lambda row: ( + _parse_datetime(row.get("scanned_at")) + or _parse_datetime(row.get("created_at")) + or datetime.min + ), + ) + first = matches[0] if matches else current + last = matches[-1] if matches else current + status = current.get("status") or "unknown" + operator_stage = "pending_human" if status == "pending" else str(status) + + return { + "schema_version": SCHEMA_VERSION, + "fingerprint": current["fingerprint"], + "matching_strategy": "namespace_and_stable_items_v1", + "window_hours": window_hours, + "occurrences_12h": len(matches), + "first_scanned_at": _iso(first.get("scanned_at") or first.get("created_at")), + "last_scanned_at": _iso(last.get("scanned_at") or last.get("created_at")), + "operator_stage": operator_stage, + "reports": [ + { + "report_id": row.get("report_id"), + "scanned_at": _iso(row.get("scanned_at")), + "created_at": _iso(row.get("created_at")), + "status": row.get("status"), + } + for row in reversed(matches[-max_reports:]) + ], + } diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index a2d09172..a3097c41 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import datetime, timedelta, timezone from types import SimpleNamespace from src.services.awooop_ansible_audit_service import ( @@ -7,6 +8,10 @@ from src.services.awooop_ansible_audit_service import ( build_ansible_truth, ) from src.services.awooop_truth_chain_service import _clean_row, _truth_status +from src.services.drift_repeat_state import ( + build_drift_fingerprint, + build_drift_repeat_state, +) def test_clean_row_parses_json_text_fields_for_gateway_visibility() -> None: @@ -69,6 +74,89 @@ def test_truth_status_marks_repeated_pending_drift_as_human_needed() -> None: assert "drift_ai_confidence_zero" in status["blockers"] +def _drift_item( + *, + resource_name: str = "awoooi-api", + field_path: str = "spec.template.spec.containers[0].image", + actual_value: str = "api:hotfix", +) -> dict: + return { + "resource_kind": "Deployment", + "resource_name": resource_name, + "namespace": "awoooi-prod", + "field_path": field_path, + "git_value": "api:main", + "actual_value": actual_value, + "drift_level": "high", + "is_allowlisted": False, + } + + +def test_drift_fingerprint_is_stable_across_item_order() -> None: + item_a = _drift_item(resource_name="awoooi-api") + item_b = _drift_item( + resource_name="awoooi-worker", + field_path="spec.template.spec.serviceAccountName", + actual_value="awoooi-executor", + ) + + first = build_drift_fingerprint("awoooi-prod", [item_a, item_b]) + second = build_drift_fingerprint("awoooi-prod", [item_b, item_a]) + changed = build_drift_fingerprint( + "awoooi-prod", + [item_a, {**item_b, "actual_value": "different-service-account"}], + ) + + assert first == second + assert first.startswith("dfp_") + assert first != changed + + +def test_drift_repeat_state_counts_matching_fingerprint_only() -> None: + now = datetime(2026, 5, 13, 1, 0, tzinfo=timezone.utc) + report = { + "report_id": "drift-now", + "namespace": "awoooi-prod", + "status": "pending", + "scanned_at": now, + "created_at": now, + "items": [_drift_item()], + } + recent = [ + { + **report, + "report_id": "drift-prev", + "scanned_at": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + **report, + "report_id": "drift-different", + "scanned_at": now - timedelta(hours=2), + "created_at": now - timedelta(hours=2), + "items": [_drift_item(actual_value="api:other")], + }, + { + **report, + "report_id": "drift-old", + "scanned_at": now - timedelta(hours=13), + "created_at": now - timedelta(hours=13), + }, + ] + + repeat_state = build_drift_repeat_state(report, recent) + + assert repeat_state["schema_version"] == "drift_repeat_state_v1" + assert repeat_state["fingerprint"].startswith("dfp_") + assert repeat_state["matching_strategy"] == "namespace_and_stable_items_v1" + assert repeat_state["occurrences_12h"] == 2 + assert repeat_state["operator_stage"] == "pending_human" + assert [row["report_id"] for row in repeat_state["reports"]] == [ + "drift-now", + "drift-prev", + ] + + def test_ansible_truth_surfaces_audited_check_mode_record() -> None: truth = build_ansible_truth( [ From 3d38039b86f75d94a7bd50e9d58cef0901fc70ae Mon Sep 17 00:00:00 2001 From: AWOOOI CD Date: Wed, 13 May 2026 07:40:58 +0800 Subject: [PATCH 13/15] chore(cd): deploy 5b34877 [skip ci] --- k8s/awoooi-prod/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index 689fc9f0..5a30a1a3 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -40,7 +40,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: 3799e0db0d30f29fdc251197634d2fca4c2c67fd + newTag: 5b34877429c16c42f0f894eb4d7f0484711fde9b - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: 3799e0db0d30f29fdc251197634d2fca4c2c67fd + newTag: 5b34877429c16c42f0f894eb4d7f0484711fde9b From 54814bc65e418724192d0ae99d64b0c624689905 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 07:52:42 +0800 Subject: [PATCH 14/15] docs(awooop): record t4 drift fingerprint deployment --- docs/LOGBOOK.md | 62 +++++++++++++++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 13 ++++ 2 files changed, 75 insertions(+) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 89c3765a..7abd7be5 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,65 @@ +## 2026-05-13 | T4 Config Drift fingerprint repeat-state 已推版 + +**背景**:Config Drift Telegram 卡片只顯示單次 `report_id` 與 HIGH/MEDIUM/INFO 計數,Operator 無法判斷是否同一漂移一直重複、已跑到哪個流程階段、是否需要人工。舊 truth-chain repeat 只用 namespace/status/counts 分組,會把「剛好同計數但 items 不同」誤認為同一漂移。 + +**修正**: +- 新增 `drift_repeat_state.py`: + - 以 namespace + sorted drift items 建立 stable fingerprint。 + - fingerprint 只看 drift 的實際 identity,不看 report_id / 掃描時間。 + - repeat-state schema:`drift_repeat_state_v1`。 +- `awooop_truth_chain_service`: + - drift report 查詢納入 `items`。 + - repeat-state 改用 stable fingerprint,比對 24h 內候選並回傳 12h repeat window。 + - 回傳 `fingerprint`、`matching_strategy=namespace_and_stable_items_v1`、`operator_stage`、matching reports。 +- `drift_narrator_service`: + - Telegram drift card body 會追加: + - `流程: drift_scanned → ai_analyzed → pending_human` + - `重複: 12h 內第 N 次同指紋` + - `指紋: dfp_xxxxx` + - 這仍只揭露真相鏈狀態,不自動採納 / 回滾 / 忽略。 + +**驗證與推版**: +- Local: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest tests/test_awooop_truth_chain_service.py tests/test_phase25_drift_detection.py tests/test_drift_interpreter_ollama_first.py tests/test_platform_router_order.py tests/test_awooop_operator_auth.py -q`:37 passed。 + - `git diff --check`:pass。 +- Gitea: + - `5b348774 feat(awooop): expose drift repeat fingerprint` 已推 `gitea main`。 + - Code Review run `1938`:success。 + - CD run `1937`:success。 + - Deploy marker:`3d38039b chore(cd): deploy 5b34877 [skip ci]`。 +- Production: + - API/Web/Worker image 均為 `5b34877429c16c42f0f894eb4d7f0484711fde9b`。 + - K3s rollout status:API/Web/Worker success。 + - `/api/v1/health`:healthy,mock_mode=false。 + - Truth-chain smoke `7f858956`: + - `source_type=drift_report` + - `current_stage=dedup_or_repeat_updated` + - `stage_status=pending` + - `needs_human=true` + - `repeat_schema=drift_repeat_state_v1` + - `fingerprint=dfp_02dc625b64784b24` + - `matching_strategy=namespace_and_stable_items_v1` + - `operator_stage=pending_human` + - `repeat_12h=2` + - `outbound_visible=2` + - Production narrator render smoke: + - `流程: drift_scanned → ai_analyzed → pending_human | 重複: 12h 內第 2 次同指紋 | 指紋: dfp_smoke1234` + +**重要校正**: +- 舊 count-based repeat 會把 `7f858956` 算成 12 次。 +- 新 stable fingerprint 顯示同一 items fingerprint 12h 內是 2 次;這代表之前的 12 次是「同計數重複候選」,不是已證明同一漂移。 + +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible audit contract + decision candidate dry-run audit 完成、部署、production smoke 完成。 +- T4:Config Drift stable fingerprint / repeat-state / Telegram stage visibility 完成、部署、production smoke 完成。 +- 仍未完成:T5 incident / approval / execution reconciliation、Ansible 真正 check-mode executor / diff / apply / rollback、first-class MCP Gateway enforcement。 + ## 2026-05-13 | T3 Ansible decision candidate audit 已推版 **背景**:T3 第一段只讓 truth-chain 看得到 Ansible audit contract 與 repo playbook catalog;但 AI decision path 還不會留下「曾考慮 Ansible、但尚未進 check-mode/apply」的 first-class record。這會讓 Telegram / Operator Console 仍看不出 Ansible 是否真的被 AI 修復鏈評估過。 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index cdf8a1ed..5b4be5f9 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1945,6 +1945,19 @@ Phase 6 完成後 - `7f858956` → `dedup_or_repeat_updated/pending`,`repeat_12h=12`,`outbound_visible=2`。 - 邊界:仍未執行 Ansible check-mode / apply / rollback;T3 目前完成的是 first-class candidate audit,而不是修復執行器。 +**T4 Config Drift fingerprint repeat-state production verified(2026-05-13 台北)**: +- `5b348774 feat(awooop): expose drift repeat fingerprint` 已推 Gitea main,Code Review run `1938` success,CD run `1937` success。 +- Deploy marker:`3d38039b chore(cd): deploy 5b34877 [skip ci]`。 +- 新增 `drift_repeat_state_v1`:以 namespace + sorted drift items 建 stable fingerprint,不再只靠 HIGH/MEDIUM/INFO counts。 +- Truth-chain drift repeat-state 現在回傳 `fingerprint`、`matching_strategy=namespace_and_stable_items_v1`、`operator_stage`、matching reports。 +- Telegram drift narrator 會在 card body 補: + - `流程: drift_scanned → ai_analyzed → pending_human` + - `重複: 12h 內第 N 次同指紋` + - `指紋: dfp_xxxxx` +- Production `7f858956` smoke:`repeat_schema=drift_repeat_state_v1`、`fingerprint=dfp_02dc625b64784b24`、`operator_stage=pending_human`、`repeat_12h=2`、`outbound_visible=2`。 +- 重要校正:舊 count-based repeat 看到 12 次,新 stable item fingerprint 證實同一漂移 fingerprint 只有 2 次;12 次只能稱為同計數候選,不能稱為同一漂移。 +- 邊界:T4 只補可觀測與重複判定,不做 auto-adopt / rollback / ignore。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) From 1003fa4246290bec2bec4cd04caae9b8221996d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 May 2026 09:02:16 +0800 Subject: [PATCH 15/15] feat(awooop): expose incident reconciliation state --- .../services/awooop_truth_chain_service.py | 129 ++++++++++++++++++ .../tests/test_awooop_truth_chain_service.py | 56 +++++++- 2 files changed, 184 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index 91e840d6..3dab3145 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -97,6 +97,127 @@ def _operation_ids(automation_ops: list[dict[str, Any]]) -> list[str]: return [str(row["op_id"]) for row in automation_ops if row.get("op_id")] +def _build_reconciliation( + *, + incident: dict[str, Any] | None, + approvals: list[dict[str, Any]], + evidence_rows: list[dict[str, Any]], + automation_ops: list[dict[str, Any]], + timeline_events: list[dict[str, Any]], +) -> dict[str, Any]: + """Build a read-only consistency report across incident lifecycle tables.""" + if incident is None: + return { + "schema_version": "incident_reconciliation_v1", + "applicable": False, + "consistency_status": "not_applicable", + "operator_next_state": "not_applicable", + "facts": {}, + "mismatches": [], + } + + incident_status = str(incident.get("status") or "unknown").upper() + incident_closed = incident_status in {"RESOLVED", "CLOSED"} + latest_approval = approvals[0] if approvals else None + approval_status = str((latest_approval or {}).get("status") or "none").upper() + approval_action = str((latest_approval or {}).get("action") or "") + approval_resolved = bool((latest_approval or {}).get("resolved_at")) + attempted = sum(int(row.get("sensors_attempted") or 0) for row in evidence_rows) + succeeded = sum(int(row.get("sensors_succeeded") or 0) for row in evidence_rows) + executed_ops = [ + row + for row in automation_ops + if str(row.get("status") or "").lower() + in {"success", "completed", "executed"} + ] + mismatches: list[dict[str, Any]] = [] + + def add(code: str, severity: str, message: str) -> None: + mismatches.append({ + "code": code, + "severity": severity, + "message": message, + }) + + if ( + latest_approval + and not incident_closed + and (approval_resolved or approval_status in {"APPROVED", "REJECTED"}) + ): + add( + "incident_open_after_approval_resolved", + "high", + "Approval reached a terminal state while the incident is still open.", + ) + + if approval_status == "APPROVED" and not automation_ops: + add( + "approval_approved_without_execution_record", + "high", + "Approval is approved but automation_operation_log has no linked execution record.", + ) + + if ( + approval_status == "APPROVED" + and "NO_ACTION" in approval_action.upper() + and not executed_ops + ): + add( + "approval_no_action_without_execution", + "high", + "Approval resolved to NO_ACTION and no executor produced a successful operation.", + ) + + if attempted > 0 and succeeded == 0: + add( + "evidence_all_sensors_failed", + "medium", + "Evidence collection attempted sensors but none succeeded.", + ) + + if latest_approval and not timeline_events: + add( + "timeline_missing_for_approval", + "medium", + "Approval exists but timeline_events has no linked lifecycle entries.", + ) + + high_count = sum(1 for row in mismatches if row["severity"] == "high") + medium_count = sum(1 for row in mismatches if row["severity"] == "medium") + if high_count: + consistency_status = "blocked" + operator_next_state = "manual_required" + elif medium_count: + consistency_status = "degraded" + operator_next_state = "investigate" + else: + consistency_status = "consistent" + operator_next_state = "continue" + + return { + "schema_version": "incident_reconciliation_v1", + "applicable": True, + "consistency_status": consistency_status, + "operator_next_state": operator_next_state, + "facts": { + "incident_id": incident.get("incident_id"), + "incident_status": incident_status, + "incident_closed": incident_closed, + "latest_approval_id": (latest_approval or {}).get("id"), + "latest_approval_status": approval_status, + "latest_approval_action": approval_action, + "approval_resolved": approval_resolved, + "evidence_records": len(evidence_rows), + "sensors_attempted": attempted, + "sensors_succeeded": succeeded, + "automation_operation_records": len(automation_ops), + "executed_operation_records": len(executed_ops), + "timeline_events": len(timeline_events), + }, + "mismatches": mismatches, + } + + def _truth_status( *, incident: dict[str, Any] | None, @@ -573,6 +694,13 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ legacy_mcp_total=legacy_mcp_summary["total"], outbound_visible_total=len(outbound_rows), ) + reconciliation = _build_reconciliation( + incident=incident, + approvals=approvals, + evidence_rows=evidence_rows, + automation_ops=automation_ops, + timeline_events=timeline_events, + ) evidence_totals = { "records": len(evidence_rows), @@ -618,6 +746,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ "automation_operation_log": automation_ops, "ansible": build_ansible_truth(automation_ops, incident=incident, drift=drift), }, + "reconciliation": reconciliation, "learning": { "knowledge_entries": km_entries, }, diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index a3097c41..6828a563 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -7,7 +7,11 @@ from src.services.awooop_ansible_audit_service import ( build_ansible_decision_audit_payload, build_ansible_truth, ) -from src.services.awooop_truth_chain_service import _clean_row, _truth_status +from src.services.awooop_truth_chain_service import ( + _build_reconciliation, + _clean_row, + _truth_status, +) from src.services.drift_repeat_state import ( build_drift_fingerprint, build_drift_repeat_state, @@ -157,6 +161,56 @@ def test_drift_repeat_state_counts_matching_fingerprint_only() -> None: ] +def test_reconciliation_blocks_open_incident_after_no_action_approval() -> None: + reconciliation = _build_reconciliation( + incident={"incident_id": "INC-1", "status": "INVESTIGATING"}, + approvals=[ + { + "id": "approval-1", + "status": "APPROVED", + "action": "未知操作 | NO_ACTION", + "resolved_at": "2026-05-13T01:00:00+00:00", + } + ], + evidence_rows=[{"sensors_attempted": 8, "sensors_succeeded": 0}], + automation_ops=[], + timeline_events=[], + ) + + codes = {row["code"] for row in reconciliation["mismatches"]} + assert reconciliation["schema_version"] == "incident_reconciliation_v1" + assert reconciliation["consistency_status"] == "blocked" + assert reconciliation["operator_next_state"] == "manual_required" + assert reconciliation["facts"]["incident_closed"] is False + assert reconciliation["facts"]["automation_operation_records"] == 0 + assert "incident_open_after_approval_resolved" in codes + assert "approval_approved_without_execution_record" in codes + assert "approval_no_action_without_execution" in codes + assert "evidence_all_sensors_failed" in codes + assert "timeline_missing_for_approval" in codes + + +def test_reconciliation_marks_consistent_resolved_execution() -> None: + reconciliation = _build_reconciliation( + incident={"incident_id": "INC-2", "status": "RESOLVED"}, + approvals=[ + { + "id": "approval-2", + "status": "APPROVED", + "action": "restart service", + "resolved_at": "2026-05-13T01:00:00+00:00", + } + ], + evidence_rows=[{"sensors_attempted": 8, "sensors_succeeded": 7}], + automation_ops=[{"status": "success"}], + timeline_events=[{"event_type": "executor", "status": "success"}], + ) + + assert reconciliation["consistency_status"] == "consistent" + assert reconciliation["operator_next_state"] == "continue" + assert reconciliation["mismatches"] == [] + + def test_ansible_truth_surfaces_audited_check_mode_record() -> None: truth = build_ansible_truth( [