diff --git a/.gitea/workflows/run-migration.yml b/.gitea/workflows/run-migration.yml index 3ed3ad85..452ccde3 100644 --- a/.gitea/workflows/run-migration.yml +++ b/.gitea/workflows/run-migration.yml @@ -133,13 +133,15 @@ jobs: PGURL_PSQL="${PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}" OWNER_PGURL_PSQL="${OWNER_PGURL/postgresql+asyncpg:\/\//postgresql:\/\/}" FILES_JSON=$(echo "${{ steps.diff.outputs.new_files }}" | jq -Rn '[inputs | select(length > 0)]') + SUMMARY_JSON=$(jq -cn \ + --arg commit_sha "${{ github.sha }}" \ + --argjson files "$FILES_JSON" \ + '{type: "ci_migration", commit_sha: $commit_sha, files: $files}') + SUMMARY_JSON_SQL=${SUMMARY_JSON//\'/\'\'} seed_audit() { local url="$1" - psql "$url" \ - -v ON_ERROR_STOP=1 \ - -v commit_sha="${{ github.sha }}" \ - -v files_json="$FILES_JSON" <<'SQL' + psql "$url" -v ON_ERROR_STOP=1 < dict: + """Return stable fingerprint repeat state for a drift report.""" + from src.services.drift_repeat_state import build_drift_repeat_state + + async with get_db_context() as db: + result = await db.execute( + text(""" + SELECT + report_id, + namespace, + status, + scanned_at, + created_at, + items + FROM drift_reports + WHERE namespace = :namespace + AND created_at > now() - interval '24 hours' + ORDER BY scanned_at DESC + LIMIT 200 + """), + {"namespace": report.namespace}, + ) + rows = [dict(row) for row in result.mappings().all()] + return build_drift_repeat_state(report, rows) + _drift_repo: DriftReportRepository | None = None diff --git a/apps/api/src/services/awooop_ansible_audit_service.py b/apps/api/src/services/awooop_ansible_audit_service.py new file mode 100644 index 00000000..40e629da --- /dev/null +++ b/apps/api/src/services/awooop_ansible_audit_service.py @@ -0,0 +1,433 @@ +"""AwoooP Ansible audit helpers. + +This module is intentionally non-executing. 
# Operation types that, on their own, mark an automation_operation_log row as
# an Ansible audit record.
ANSIBLE_OPERATION_TYPES = frozenset({
    "ansible_candidate_matched",
    "ansible_check_mode_executed",
    "ansible_apply_executed",
    "ansible_rollback_executed",
    "ansible_execution_skipped",
})

# Flattened ``input ->> ...`` / ``output ->> ...`` columns that may name the
# executor backend of an operation row, in lookup priority order.
_EXECUTOR_KEYS: tuple[str, ...] = (
    "input_executor",
    "input_execution_backend",
    "output_executor",
    "output_execution_backend",
)

# Flattened columns that may carry a playbook path, in lookup priority order.
# Shared by the classifier and the record builder so both read the same keys.
_PLAYBOOK_PATH_KEYS: tuple[str, ...] = (
    "input_playbook_path",
    "output_playbook_path",
    "input_ansible_playbook_path",
    "output_ansible_playbook_path",
)

# File suffixes accepted as playbook files by the path heuristic.
_PLAYBOOK_SUFFIXES: tuple[str, ...] = (".yml", ".yaml")

# Catalog fields that are exposed in hints; ``keywords`` stays internal.
_PUBLIC_CATALOG_FIELDS = frozenset({
    "catalog_id",
    "playbook_path",
    "inventory_hosts",
    "domains",
    "supports_check_mode",
    "auto_apply_enabled",
    "approval_required",
    "risk_level",
})

# Static, repo-known playbook catalog. Entries are hints only: every entry has
# ``auto_apply_enabled`` False and ``approval_required`` True.
_CATALOG: tuple[dict[str, Any], ...] = (
    {
        "catalog_id": "ansible:110-devops",
        "playbook_path": "infra/ansible/playbooks/110-devops.yml",
        "inventory_hosts": ["host_110"],
        "domains": ["swap", "harbor", "sentry", "gitea", "langfuse", "bitan", "runner", "keepalived", "nginx"],
        "keywords": [
            "110",
            "docker",
            "container",
            "dockercontainerunhealthy",
            "swap",
            "harbor",
            "sentry",
            "gitea",
            "langfuse",
            "bitan",
            "runner",
            "github-runner",
            "keepalived",
        ],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:188-ai-web",
        "playbook_path": "infra/ansible/playbooks/188-ai-web.yml",
        "inventory_hosts": ["host_188"],
        "domains": ["docker", "momo_backup", "signoz", "minio", "litellm", "n8n", "open_webui", "nginx"],
        "keywords": [
            "188",
            "docker",
            "container",
            "dockercontainerunhealthy",
            "momo",
            "backup",
            "postgresql",
            "pg_backup",
            "signoz",
            "minio",
            "litellm",
            "n8n",
            "open-webui",
            "openwebui",
            "docker-registry",
        ],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:nginx-sync",
        "playbook_path": "infra/ansible/playbooks/nginx-sync.yml",
        "inventory_hosts": ["host_110", "host_188"],
        "domains": ["nginx", "proxy", "ollama_proxy", "tls"],
        "keywords": ["nginx", "proxy", "ollama", "gcp", "tls", "cert", "502", "upstream"],
        "supports_check_mode": True,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "medium",
    },
    {
        "catalog_id": "ansible:restore-password-auth",
        "playbook_path": "infra/ansible/playbooks/restore-password-auth.yml",
        "inventory_hosts": ["host_110", "host_120", "host_121", "host_188"],
        "domains": ["ssh", "password_auth"],
        "keywords": ["ssh", "passwordauthentication", "password auth", "login", "auth"],
        "supports_check_mode": False,
        "auto_apply_enabled": False,
        "approval_required": True,
        "risk_level": "high",
    },
)


def _get(row: dict[str, Any], key: str) -> Any:
    """Return ``row[key]`` or ``None`` when the key is absent."""
    return row.get(key)


def _tags(row: dict[str, Any]) -> list[str]:
    """Return the row's tags as lower-cased strings.

    Accepts either a list of tags or one comma-separated string; any other
    shape (including a missing ``tags`` key) yields an empty list.
    """
    raw = _get(row, "tags")
    if isinstance(raw, list):
        return [str(item).lower() for item in raw]
    if isinstance(raw, str):
        return [part.strip().lower() for part in raw.split(",") if part.strip()]
    return []


def _first_present(row: dict[str, Any], keys: tuple[str, ...]) -> Any:
    """Return the first value among ``keys`` that is neither ``None`` nor ``""``."""
    for key in keys:
        value = _get(row, key)
        if value not in (None, ""):
            return value
    return None


def _is_ansible_operation(row: dict[str, Any]) -> bool:
    """Decide whether an automation operation row is an Ansible audit record.

    A row qualifies when any of the following holds, checked in this order:

    1. its ``operation_type`` is one of ``ANSIBLE_OPERATION_TYPES``;
    2. it carries an ``ansible`` tag;
    3. its executor / execution backend column equals ``ansible``;
    4. its playbook path lives under ``infra/ansible/``, or is a YAML file
       (``.yml`` or ``.yaml``) whose path mentions ``ansible``.

    All comparisons are case-insensitive.
    """
    operation_type = str(_get(row, "operation_type") or "").lower()
    if operation_type in ANSIBLE_OPERATION_TYPES:
        return True
    if "ansible" in _tags(row):
        return True

    executor = str(_first_present(row, _EXECUTOR_KEYS) or "").lower()
    if executor == "ansible":
        return True

    playbook_path = str(_first_present(row, _PLAYBOOK_PATH_KEYS) or "").lower()
    if "infra/ansible/" in playbook_path:
        return True
    # Written as its own statement so the or/and grouping is explicit; both
    # common YAML suffixes are accepted, not only ".yml".
    return playbook_path.endswith(_PLAYBOOK_SUFFIXES) and "ansible" in playbook_path


def _ansible_record(row: dict[str, Any]) -> dict[str, Any]:
    """Project an operation row onto the fields shown in the truth chain.

    For fields that may live in either the ``input`` or the ``output`` payload
    the input-side column wins when it is present and non-empty.
    """
    return {
        "op_id": _get(row, "op_id"),
        "operation_type": _get(row, "operation_type"),
        "status": _get(row, "status"),
        "actor": _get(row, "actor"),
        "playbook_id": _first_present(row, ("input_playbook_id", "output_playbook_id")),
        "playbook_path": _first_present(row, _PLAYBOOK_PATH_KEYS),
        "check_mode": _first_present(row, ("input_check_mode", "output_check_mode")),
        "not_used_reason": _first_present(row, ("input_not_used_reason", "output_not_used_reason")),
        "dry_run_result": _get(row, "dry_run_result"),
        "error": _get(row, "error"),
        "duration_ms": _get(row, "duration_ms"),
        "tags": _get(row, "tags"),
        "created_at": _get(row, "created_at"),
    }


def _flatten_text(value: Any, pieces: list[str], remaining: int = 80) -> int:
    """Append lower-cased scalar text found in ``value`` to ``pieces``.

    Dicts contribute both keys and values, lists contribute their items, and
    ``None`` is skipped. ``remaining`` is a budget of scalar pieces: each
    appended piece consumes one unit and traversal stops once it reaches zero.

    Returns:
        The budget left after processing ``value``.
    """
    if remaining <= 0 or value is None:
        return remaining
    if isinstance(value, dict):
        for key, item in value.items():
            remaining = _flatten_text(key, pieces, remaining)
            remaining = _flatten_text(item, pieces, remaining)
            if remaining <= 0:
                break
        return remaining
    if isinstance(value, list):
        for item in value:
            remaining = _flatten_text(item, pieces, remaining)
            if remaining <= 0:
                break
        return remaining
    pieces.append(str(value).lower())
    return remaining - 1


def _source_haystack(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> str:
    """Join the flattened incident and drift text into one search string.

    Each source gets its own piece budget (the ``_flatten_text`` default).
    """
    pieces: list[str] = []
    _flatten_text(incident, pieces)
    _flatten_text(drift, pieces)
    return " ".join(pieces)


def _catalog_hints(incident: dict[str, Any] | None, drift: dict[str, Any] | None) -> dict[str, Any]:
    """Match catalog keywords against the incident/drift text.

    Matching is plain substring containment on the lower-cased haystack, so a
    short keyword can match inside a longer unrelated token. The result is
    labelled ``decision_effect: none`` and must be treated as a hint only.

    Returns:
        A dict with the matched ``candidates`` (highest ``match_score`` first,
        ties broken by ``catalog_id``) and the ``unmatched_catalog_ids``.
    """
    haystack = _source_haystack(incident, drift)
    candidates: list[dict[str, Any]] = []
    unmatched: list[str] = []
    for item in _CATALOG:
        matched = [keyword for keyword in item["keywords"] if keyword in haystack]
        if not matched:
            unmatched.append(item["catalog_id"])
            continue
        public_item = {key: value for key, value in item.items() if key in _PUBLIC_CATALOG_FIELDS}
        candidates.append({
            **public_item,
            "match_score": len(matched),
            "matched_keywords": matched,
        })
    candidates.sort(key=lambda row: (-int(row["match_score"]), str(row["catalog_id"])))
    return {
        "match_mode": "static_catalog_keyword_hint_v1",
        "decision_effect": "none",
        "available_count": len(_CATALOG),
        "candidates": candidates,
        "unmatched_catalog_ids": unmatched,
    }


def build_ansible_truth(
    automation_ops: list[dict[str, Any]],
    *,
    incident: dict[str, Any] | None,
    drift: dict[str, Any] | None,
) -> dict[str, Any]:
    """Build the truth-chain Ansible section from audited facts and catalog hints.

    Args:
        automation_ops: Operation rows for the source; ``None`` is treated as
            an empty list.
        incident: Incident row used for catalog keyword hints, if any.
        drift: Drift report row used for catalog keyword hints, if any.

    Returns:
        A dict whose ``considered`` flag is true only when at least one row is
        classified as an Ansible operation; catalog candidates alone never set
        it. ``not_used_reason`` is ``None`` exactly when records exist.
    """
    records = [_ansible_record(row) for row in automation_ops or [] if _is_ansible_operation(row)]
    return {
        "considered": bool(records),
        "records": records,
        "audit_contract": {
            "schema_version": "ansible_executor_audit_v1",
            "operation_types": sorted(ANSIBLE_OPERATION_TYPES),
            "required_audit_fields": [
                "operation_type",
                "status",
                "actor",
                "input.executor",
                "input.playbook_path",
                "input.check_mode",
                "output.not_used_reason",
                "dry_run_result",
            ],
            "default_execution_mode": "catalog/dry-run audit only until approval execution is explicitly wired",
        },
        "candidate_catalog": _catalog_hints(incident, drift),
        "not_used_reason": (
            None
            if records
            else "no automation_operation_log row with Ansible operation type, tag, or executor backend for this source"
        ),
    }


def _incident_public_dict(incident: Any) -> dict[str, Any]:
    """Return a plain-dict view of an incident for keyword matching.

    ``None`` becomes ``{}`` and a dict is returned unchanged. Any other object
    is read attribute by attribute; a ``severity`` exposing ``.value`` is
    reduced to that value.
    """
    if incident is None:
        return {}
    if isinstance(incident, dict):
        return incident
    severity = getattr(incident, "severity", None)
    signals_payload: list[dict[str, Any]] = [
        {
            "alert_name": getattr(signal, "alert_name", None),
            "labels": getattr(signal, "labels", None) or {},
            "annotations": getattr(signal, "annotations", None) or {},
        }
        for signal in getattr(incident, "signals", None) or []
    ]
    return {
        "incident_id": getattr(incident, "incident_id", None),
        "project_id": getattr(incident, "project_id", None),
        "alertname": getattr(incident, "alertname", None),
        "alert_category": getattr(incident, "alert_category", None),
        "notification_type": getattr(incident, "notification_type", None),
        "severity": getattr(severity, "value", severity),
        "affected_services": getattr(incident, "affected_services", None) or [],
        "signals": signals_payload,
    }


def build_ansible_decision_audit_payload(
    *,
    incident: Any,
    proposal_data: dict[str, Any],
    decision_path: str,
    not_used_reason: str,
) -> dict[str, Any] | None:
    """Return an AOL payload when Ansible has catalog candidates for a decision.

    Args:
        incident: Incident object or dict; only its public fields are matched.
        proposal_data: Proposal fields copied into the audit input. ``None``
            is tolerated and treated as an empty proposal so that building
            the payload cannot raise on a missing proposal.
        decision_path: Label of the decision branch that produced the audit.
        not_used_reason: Why Ansible was not executed on this path.

    Returns:
        ``None`` when no catalog entry matches the incident; otherwise a dict
        with ``operation_type``, ``status``, ``input``, ``output``,
        ``dry_run_result`` and ``tags``. At most five candidates are listed
        and the action preview is truncated to 240 characters.
    """
    incident_payload = _incident_public_dict(incident)
    hints = _catalog_hints(incident_payload, None)
    candidates = hints.get("candidates") or []
    if not candidates:
        return None

    proposal = proposal_data or {}
    incident_id = str(incident_payload.get("incident_id") or "")
    input_payload = {
        "incident_id": incident_id,
        "executor": "ansible",
        "execution_backend": "ansible",
        "decision_path": decision_path,
        "check_mode": True,
        "apply_enabled": False,
        "approval_required": True,
        "candidate_catalog_schema": hints["match_mode"],
        "executor_candidates": [
            {
                "catalog_id": row["catalog_id"],
                "playbook_path": row["playbook_path"],
                "inventory_hosts": row["inventory_hosts"],
                "risk_level": row["risk_level"],
                "match_score": row["match_score"],
                "matched_keywords": row["matched_keywords"],
            }
            for row in candidates[:5]
        ],
        "proposal_source": proposal.get("source", ""),
        "proposal_risk_level": proposal.get("risk_level", ""),
        "proposal_action_preview": str(
            proposal.get("action")
            or proposal.get("kubectl_command")
            or ""
        )[:240],
    }
    output_payload = {
        "not_used_reason": not_used_reason,
        "decision_effect": "audit_only",
        "next_required_step": "wire approval_execution to Ansible check-mode before apply",
    }
    return {
        "operation_type": "ansible_candidate_matched",
        "status": "dry_run",
        "input": input_payload,
        "output": output_payload,
        "dry_run_result": {
            "check_mode_executed": False,
            "candidate_count": len(candidates),
            "reason": not_used_reason,
        },
        "tags": ["ansible", "decision", "candidate", "check_mode_pending"],
    }
json.dumps(payload["dry_run_result"], ensure_ascii=False), + "tags": payload["tags"], + }, + ) + return True + except Exception as exc: + logger.warning( + "ansible_decision_audit_write_failed", + incident_id=incident_id, + error=str(exc), + ) + return False diff --git a/apps/api/src/services/awooop_truth_chain_service.py b/apps/api/src/services/awooop_truth_chain_service.py index d1e669dd..3dab3145 100644 --- a/apps/api/src/services/awooop_truth_chain_service.py +++ b/apps/api/src/services/awooop_truth_chain_service.py @@ -7,6 +7,7 @@ Telegram cards can be audited without guessing which subsystem owns the truth. from __future__ import annotations +import json from datetime import date, datetime from decimal import Decimal from typing import Any @@ -16,10 +17,13 @@ import structlog from sqlalchemy import text from src.db.base import get_db_context +from src.services.awooop_ansible_audit_service import build_ansible_truth +from src.services.drift_repeat_state import build_drift_repeat_state logger = structlog.get_logger(__name__) _MAX_ROWS = 100 +_JSON_TEXT_FIELDS = {"gate_result", "source_envelope"} def _clean(value: Any) -> Any: @@ -38,7 +42,15 @@ def _clean(value: Any) -> Any: def _clean_row(row: Any) -> dict[str, Any]: - return {key: _clean(value) for key, value in dict(row).items()} + cleaned: dict[str, Any] = {} + for key, value in dict(row).items(): + if key in _JSON_TEXT_FIELDS and isinstance(value, str): + try: + value = json.loads(value) + except json.JSONDecodeError: + pass + cleaned[key] = _clean(value) + return cleaned async def _fetch_all(db: Any, sql: str, params: dict[str, Any]) -> list[dict[str, Any]]: @@ -85,6 +97,127 @@ def _operation_ids(automation_ops: list[dict[str, Any]]) -> list[str]: return [str(row["op_id"]) for row in automation_ops if row.get("op_id")] +def _build_reconciliation( + *, + incident: dict[str, Any] | None, + approvals: list[dict[str, Any]], + evidence_rows: list[dict[str, Any]], + automation_ops: list[dict[str, Any]], + 
timeline_events: list[dict[str, Any]], +) -> dict[str, Any]: + """Build a read-only consistency report across incident lifecycle tables.""" + if incident is None: + return { + "schema_version": "incident_reconciliation_v1", + "applicable": False, + "consistency_status": "not_applicable", + "operator_next_state": "not_applicable", + "facts": {}, + "mismatches": [], + } + + incident_status = str(incident.get("status") or "unknown").upper() + incident_closed = incident_status in {"RESOLVED", "CLOSED"} + latest_approval = approvals[0] if approvals else None + approval_status = str((latest_approval or {}).get("status") or "none").upper() + approval_action = str((latest_approval or {}).get("action") or "") + approval_resolved = bool((latest_approval or {}).get("resolved_at")) + attempted = sum(int(row.get("sensors_attempted") or 0) for row in evidence_rows) + succeeded = sum(int(row.get("sensors_succeeded") or 0) for row in evidence_rows) + executed_ops = [ + row + for row in automation_ops + if str(row.get("status") or "").lower() + in {"success", "completed", "executed"} + ] + mismatches: list[dict[str, Any]] = [] + + def add(code: str, severity: str, message: str) -> None: + mismatches.append({ + "code": code, + "severity": severity, + "message": message, + }) + + if ( + latest_approval + and not incident_closed + and (approval_resolved or approval_status in {"APPROVED", "REJECTED"}) + ): + add( + "incident_open_after_approval_resolved", + "high", + "Approval reached a terminal state while the incident is still open.", + ) + + if approval_status == "APPROVED" and not automation_ops: + add( + "approval_approved_without_execution_record", + "high", + "Approval is approved but automation_operation_log has no linked execution record.", + ) + + if ( + approval_status == "APPROVED" + and "NO_ACTION" in approval_action.upper() + and not executed_ops + ): + add( + "approval_no_action_without_execution", + "high", + "Approval resolved to NO_ACTION and no executor produced a 
successful operation.", + ) + + if attempted > 0 and succeeded == 0: + add( + "evidence_all_sensors_failed", + "medium", + "Evidence collection attempted sensors but none succeeded.", + ) + + if latest_approval and not timeline_events: + add( + "timeline_missing_for_approval", + "medium", + "Approval exists but timeline_events has no linked lifecycle entries.", + ) + + high_count = sum(1 for row in mismatches if row["severity"] == "high") + medium_count = sum(1 for row in mismatches if row["severity"] == "medium") + if high_count: + consistency_status = "blocked" + operator_next_state = "manual_required" + elif medium_count: + consistency_status = "degraded" + operator_next_state = "investigate" + else: + consistency_status = "consistent" + operator_next_state = "continue" + + return { + "schema_version": "incident_reconciliation_v1", + "applicable": True, + "consistency_status": consistency_status, + "operator_next_state": operator_next_state, + "facts": { + "incident_id": incident.get("incident_id"), + "incident_status": incident_status, + "incident_closed": incident_closed, + "latest_approval_id": (latest_approval or {}).get("id"), + "latest_approval_status": approval_status, + "latest_approval_action": approval_action, + "approval_resolved": approval_resolved, + "evidence_records": len(evidence_rows), + "sensors_attempted": attempted, + "sensors_succeeded": succeeded, + "automation_operation_records": len(automation_ops), + "executed_operation_records": len(executed_ops), + "timeline_events": len(timeline_events), + }, + "mismatches": mismatches, + } + + def _truth_status( *, incident: dict[str, Any] | None, @@ -255,6 +388,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ created_at, resolved_at, interpretation, + items, narrative_text FROM drift_reports WHERE report_id = :source_id @@ -411,15 +545,30 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ error, duration_ms, tags, + input ->> 
'executor' AS input_executor, + input ->> 'execution_backend' AS input_execution_backend, + input ->> 'playbook_id' AS input_playbook_id, + input ->> 'playbook_path' AS input_playbook_path, + input ->> 'ansible_playbook_path' AS input_ansible_playbook_path, + input ->> 'check_mode' AS input_check_mode, + input ->> 'not_used_reason' AS input_not_used_reason, + output ->> 'executor' AS output_executor, + output ->> 'execution_backend' AS output_execution_backend, + output ->> 'playbook_id' AS output_playbook_id, + output ->> 'playbook_path' AS output_playbook_path, + output ->> 'ansible_playbook_path' AS output_ansible_playbook_path, + output ->> 'check_mode' AS output_check_mode, + output ->> 'not_used_reason' AS output_not_used_reason, created_at FROM automation_operation_log - WHERE coalesce(input::text, '') LIKE :needle + WHERE incident_id::text = :incident_id + OR coalesce(input::text, '') LIKE :needle OR coalesce(output::text, '') LIKE :needle OR coalesce(array_to_string(tags, ','), '') LIKE :needle ORDER BY created_at DESC LIMIT :limit """, - {"needle": f"%{incident_id}%", "limit": _MAX_ROWS}, + {"incident_id": incident_id, "needle": f"%{incident_id}%", "limit": _MAX_ROWS}, ) km_entries = await _fetch_all( db, @@ -447,55 +596,27 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ "reports": [], } if drift is not None: - repeat_summary = await _fetch_one( + recent_drift_reports = await _fetch_all( db, """ SELECT - count(*) AS occurrences_12h, - min(scanned_at) AS first_scanned_at, - max(scanned_at) AS last_scanned_at + report_id, + namespace, + status, + scanned_at, + created_at, + items, + interpretation, + narrative_text FROM drift_reports - WHERE created_at > now() - interval '12 hours' + WHERE created_at > now() - interval '24 hours' AND namespace = :namespace - AND status = :status - AND high_count = :high_count - AND medium_count = :medium_count - AND info_count = :info_count - """, - { - "namespace": drift["namespace"], - 
"status": drift["status"], - "high_count": drift["high_count"], - "medium_count": drift["medium_count"], - "info_count": drift["info_count"], - }, - ) - repeat_reports = await _fetch_all( - db, - """ - SELECT report_id, scanned_at, created_at, status, interpretation, narrative_text - FROM drift_reports - WHERE created_at > now() - interval '12 hours' - AND namespace = :namespace - AND status = :status - AND high_count = :high_count - AND medium_count = :medium_count - AND info_count = :info_count ORDER BY scanned_at DESC - LIMIT 20 + LIMIT 200 """, - { - "namespace": drift["namespace"], - "status": drift["status"], - "high_count": drift["high_count"], - "medium_count": drift["medium_count"], - "info_count": drift["info_count"], - }, + {"namespace": drift["namespace"]}, ) - drift_repeats = { - **(repeat_summary or {}), - "reports": repeat_reports, - } + drift_repeats = build_drift_repeat_state(drift, recent_drift_reports) gateway_mcp_rows = await _fetch_all( db, @@ -507,6 +628,7 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ trace_id, agent_id, tool_name, + gate_result, result_status, block_gate, block_reason, @@ -572,6 +694,13 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ legacy_mcp_total=legacy_mcp_summary["total"], outbound_visible_total=len(outbound_rows), ) + reconciliation = _build_reconciliation( + incident=incident, + approvals=approvals, + evidence_rows=evidence_rows, + automation_ops=automation_ops, + timeline_events=timeline_events, + ) evidence_totals = { "records": len(evidence_rows), @@ -615,12 +744,9 @@ async def fetch_truth_chain(source_id: str, project_id: str = "awoooi") -> dict[ }, "execution": { "automation_operation_log": automation_ops, - "ansible": { - "considered": False, - "records": [], - "not_used_reason": "no first-class Ansible executor audit record in current truth chain", - }, + "ansible": build_ansible_truth(automation_ops, incident=incident, drift=drift), }, + 
"reconciliation": reconciliation, "learning": { "knowledge_entries": km_entries, }, diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index c77b0af9..a5f021a7 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1790,6 +1790,25 @@ class DecisionManager: token.proposal_data["auto_approve_reason"] = auto_decision.reason_detail await self._save_token(token) + try: + from src.services.awooop_ansible_audit_service import ( + record_ansible_decision_audit as _record_ansible_decision_audit, + ) + + _fire_and_forget( + _record_ansible_decision_audit( + incident=incident, + proposal_data=token.proposal_data, + decision_path="auto_execute", + not_used_reason=( + "auto_execute selected existing executor path; " + "Ansible check-mode is not wired yet" + ), + ) + ) + except Exception as _ansible_audit_err: + logger.debug("ansible_decision_audit_schedule_error", error=str(_ansible_audit_err)) + # 觸發自動執行 (非阻塞) _fire_and_forget( self._auto_execute(incident, token) @@ -1813,6 +1832,24 @@ class DecisionManager: ), ) ) + try: + from src.services.awooop_ansible_audit_service import ( + record_ansible_decision_audit as _record_ansible_decision_audit, + ) + + _fire_and_forget( + _record_ansible_decision_audit( + incident=incident, + proposal_data=token.proposal_data, + decision_path="manual_approval", + not_used_reason=( + "manual approval required; Ansible check-mode " + "is not wired to approval execution yet" + ), + ) + ) + except Exception as _ansible_audit_err: + logger.debug("ansible_decision_audit_schedule_error", error=str(_ansible_audit_err)) _fire_and_forget( _push_decision_to_telegram(incident, token.proposal_data) ) diff --git a/apps/api/src/services/drift_narrator_service.py b/apps/api/src/services/drift_narrator_service.py index e09448e6..8e29651f 100644 --- a/apps/api/src/services/drift_narrator_service.py +++ b/apps/api/src/services/drift_narrator_service.py @@ -148,7 
+148,13 @@ class DriftNarratorService: # 2026-04-18 B 方案: LLM 同時產 narrative + 結構化 items(取代 str()[:30]) # 2026-04-20 P0.2: 追加 recommendation(action/confidence/reason) narrative, items, recommendation = await self._generate_narrative_and_items(report, interpretation) - await self._send_telegram(report, narrative, items, recommendation) + repeat_state = None + try: + from src.repositories.drift_repository import get_drift_repository + repeat_state = await get_drift_repository().get_repeat_state(report) + except Exception as e: + logger.warning("drift_repeat_state_lookup_failed", report_id=report.report_id, error=str(e)) + await self._send_telegram(report, narrative, items, recommendation, repeat_state) # 寫入 DB narrative_text (Phase 30 ADR-067) try: @@ -643,6 +649,7 @@ class DriftNarratorService: narrative: str, items: list[dict], recommendation: dict | None = None, + repeat_state: dict | None = None, ) -> None: """ 推送 TYPE-4D Config Drift 卡片(ADR-075)+ B 方案智能摘要 @@ -654,7 +661,7 @@ class DriftNarratorService: """ from src.services.telegram_gateway import get_telegram_gateway - diff_summary = self._render_telegram_body(report, narrative, items, recommendation) + diff_summary = self._render_telegram_body(report, narrative, items, recommendation, repeat_state) try: tg = get_telegram_gateway() @@ -711,6 +718,7 @@ class DriftNarratorService: narrative: str, items: list[dict], recommendation: dict | None = None, + repeat_state: dict | None = None, ) -> str: """ 組裝 Telegram 卡片 body(B 方案格式 + P0.2 AI 推薦) @@ -741,6 +749,10 @@ class DriftNarratorService: }.get(_act, _act) lines.append(f"🎯 AI 建議:{_emoji_action} ({int(_conf * 100)}%) — {_reason}\n") + repeat_line = self._render_repeat_state(repeat_state) + if repeat_line: + lines.append(f"{repeat_line}\n") + lines.append(f"🤖 AI 研判\n{narrative}\n") # 用非 trivial + 非白名單 的實際可操作數顯示 @@ -761,6 +773,23 @@ class DriftNarratorService: return "\n".join(lines) + def _render_repeat_state(self, repeat_state: dict | None) -> str: + """Render 
SCHEMA_VERSION = "drift_repeat_state_v1"
FINGERPRINT_VERSION = "drift_fingerprint_v1"


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from a dict, or as an attribute from any other object."""
    if isinstance(obj, dict):
        return obj.get(key, default)
    return getattr(obj, key, default)


def _enum_value(value: Any) -> Any:
    """Return ``value.value`` when present (enum members), else ``value``."""
    return getattr(value, "value", value)


def _jsonable(value: Any) -> Any:
    """Convert ``value`` into a deterministic, JSON-serialisable structure.

    Enum members collapse to their value, dict keys become strings, tuples
    become lists and datetimes become ISO strings. Sets and frozensets are
    emitted as lists sorted by their JSON text, because their iteration order
    is not stable and would otherwise leak into the fingerprint.
    """
    value = _enum_value(value)
    if isinstance(value, dict):
        return {str(k): _jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_jsonable(v) for v in value]
    if isinstance(value, (set, frozenset)):
        members = [_jsonable(v) for v in value]
        return sorted(
            members,
            key=lambda member: json.dumps(member, ensure_ascii=False, sort_keys=True, default=str),
        )
    if isinstance(value, datetime):
        return value.isoformat()
    return value


def _canonical_json(value: Any) -> str:
    """Serialise ``value`` to compact JSON with sorted keys.

    Anything ``_jsonable`` leaves untouched and ``json`` cannot encode is
    stringified through ``default=str``.
    """
    return json.dumps(
        _jsonable(value),
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )


def _to_naive_utc(moment: datetime) -> datetime:
    """Convert an aware datetime to naive UTC; return naive input unchanged."""
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc).replace(tzinfo=None)
    return moment


def _parse_datetime(value: Any) -> datetime | None:
    """Parse a datetime or ISO-8601 string into a naive datetime.

    Aware values are converted to UTC before the offset is dropped; naive
    values are returned as they are. A trailing ``Z`` is accepted. Returns
    ``None`` for ``None``, unparseable strings and any other type.
    """
    if isinstance(value, datetime):
        return _to_naive_utc(value)
    if isinstance(value, str):
        try:
            return _to_naive_utc(datetime.fromisoformat(value.replace("Z", "+00:00")))
        except ValueError:
            return None
    return None


def _iso(value: Any) -> str | None:
    """Return the normalised ISO string for ``value``, or ``None`` if unparseable."""
    parsed = _parse_datetime(value)
    return parsed.isoformat() if parsed else None


def drift_item_identity(item: Any) -> dict[str, Any]:
    """Return the stable fields that define one drift item.

    ``item`` may be a dict or an object with the same attribute names.
    """
    return {
        "resource_kind": str(_get(item, "resource_kind", "")),
        "resource_name": str(_get(item, "resource_name", "")),
        "namespace": str(_get(item, "namespace", "")),
        "field_path": str(_get(item, "field_path", "")),
        "drift_level": str(_enum_value(_get(item, "drift_level", ""))),
        "git_value": _jsonable(_get(item, "git_value")),
        "actual_value": _jsonable(_get(item, "actual_value")),
        "is_allowlisted": bool(_get(item, "is_allowlisted", False)),
    }


def build_drift_fingerprint(namespace: str, items: list[Any]) -> str:
    """Build a deterministic fingerprint from namespace + sorted drift items.

    Item identities are sorted by their canonical JSON, so the order in which
    the scanner reports items does not affect the result.

    Returns:
        ``dfp_`` followed by the first 16 hex characters of a SHA-256 digest.
    """
    identities = sorted((drift_item_identity(item) for item in items), key=_canonical_json)
    payload = {
        "version": FINGERPRINT_VERSION,
        "namespace": namespace,
        "items": identities,
    }
    digest = hashlib.sha256(_canonical_json(payload).encode("utf-8")).hexdigest()
    return f"dfp_{digest[:16]}"


def _coerce_items(raw: Any) -> list[Any]:
    """Return a report's drift items as a list.

    Items may arrive as JSON text rather than a decoded list (NOTE(review):
    depends on how the ``items`` column is returned by the driver - confirm).
    Text is decoded first; iterating the string itself would produce one bogus
    item per character. Text that is not a JSON array yields an empty list.
    """
    if isinstance(raw, str):
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError:
            return []
        return list(decoded) if isinstance(decoded, (list, tuple)) else []
    return list(raw or [])


def _report_identity(report: Any) -> dict[str, Any]:
    """Extract id, namespace, status, timestamps and fingerprint from a report."""
    namespace = str(_get(report, "namespace", ""))
    return {
        "report_id": _get(report, "report_id"),
        "namespace": namespace,
        "status": str(_enum_value(_get(report, "status", ""))),
        "scanned_at": _get(report, "scanned_at"),
        "created_at": _get(report, "created_at"),
        "fingerprint": build_drift_fingerprint(namespace, _coerce_items(_get(report, "items", []))),
    }


def _identity_time(identity: dict[str, Any]) -> datetime | None:
    """Return a report identity's scan time, falling back to creation time."""
    return _parse_datetime(identity.get("scanned_at")) or _parse_datetime(identity.get("created_at"))


def build_drift_repeat_state(
    report: Any,
    recent_reports: list[Any],
    *,
    window_hours: int = 12,
    max_reports: int = 20,
) -> dict[str, Any]:
    """Summarize repeat state for one drift report using stable fingerprints.

    Args:
        report: The report being described (dict or object).
        recent_reports: Candidate reports, typically from the same namespace.
            The current report may be included; reports are de-duplicated by
            ``report_id`` and reports without an id are ignored.
        window_hours: Look-back window, measured from the current report's
            scan (or creation) time.
        max_reports: Maximum number of matching reports listed under
            ``reports``, newest first. Zero or a negative value lists none.

    Returns:
        A dict describing how many reports inside the window share the current
        report's fingerprint. The count is stored under ``occurrences_12h``
        whatever ``window_hours`` is; read ``window_hours`` alongside it.
        Candidates without any parseable timestamp are not filtered by the
        window.
    """
    current = _report_identity(report)
    # All parsed timestamps are naive UTC, so the fallback "now" must be too;
    # a local-time now() would shift the cutoff by the host's UTC offset.
    current_time = _identity_time(current) or datetime.now(timezone.utc).replace(tzinfo=None)
    cutoff = current_time - timedelta(hours=window_hours)

    by_id: dict[str, dict[str, Any]] = {}
    for identity in [current, *(_report_identity(candidate) for candidate in recent_reports)]:
        report_id = str(identity.get("report_id") or "")
        if not report_id:
            continue
        candidate_time = _identity_time(identity)
        if candidate_time is not None and candidate_time < cutoff:
            continue
        if identity["fingerprint"] != current["fingerprint"]:
            continue
        by_id[report_id] = identity

    matches = sorted(by_id.values(), key=lambda row: _identity_time(row) or datetime.min)
    first = matches[0] if matches else current
    last = matches[-1] if matches else current
    status = current.get("status") or "unknown"
    operator_stage = "pending_human" if status == "pending" else str(status)
    # matches[-0:] would return every report, so a non-positive limit is
    # handled explicitly.
    listed = matches[-max_reports:] if max_reports > 0 else []

    return {
        "schema_version": SCHEMA_VERSION,
        "fingerprint": current["fingerprint"],
        "matching_strategy": "namespace_and_stable_items_v1",
        "window_hours": window_hours,
        "occurrences_12h": len(matches),
        "first_scanned_at": _iso(first.get("scanned_at") or first.get("created_at")),
        "last_scanned_at": _iso(last.get("scanned_at") or last.get("created_at")),
        "operator_stage": operator_stage,
        "reports": [
            {
                "report_id": row.get("report_id"),
                "scanned_at": _iso(row.get("scanned_at")),
                "created_at": _iso(row.get("created_at")),
                "status": row.get("status"),
            }
            for row in reversed(listed)
        ],
    }
a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -1,6 +1,36 @@ from __future__ import annotations -from src.services.awooop_truth_chain_service import _truth_status +from datetime import datetime, timedelta, timezone +from types import SimpleNamespace + +from src.services.awooop_ansible_audit_service import ( + build_ansible_decision_audit_payload, + build_ansible_truth, +) +from src.services.awooop_truth_chain_service import ( + _build_reconciliation, + _clean_row, + _truth_status, +) +from src.services.drift_repeat_state import ( + build_drift_fingerprint, + build_drift_repeat_state, +) + + +def test_clean_row_parses_json_text_fields_for_gateway_visibility() -> None: + row = { + "gate_result": '{"schema_version":"legacy_mcp_bridge_v1","policy_enforced":false}', + "source_envelope": '{"adapter":"legacy_telegram_gateway"}', + "plain_text": '{"not":"parsed"}', + } + + cleaned = _clean_row(row) + + assert cleaned["gate_result"]["schema_version"] == "legacy_mcp_bridge_v1" + assert cleaned["gate_result"]["policy_enforced"] is False + assert cleaned["source_envelope"]["adapter"] == "legacy_telegram_gateway" + assert cleaned["plain_text"] == '{"not":"parsed"}' def test_truth_status_marks_no_action_approval_as_manual_required() -> None: @@ -46,3 +76,217 @@ def test_truth_status_marks_repeated_pending_drift_as_human_needed() -> None: assert status["needs_human"] is True assert "drift_report_pending_without_resolution" in status["blockers"] assert "drift_ai_confidence_zero" in status["blockers"] + + +def _drift_item( + *, + resource_name: str = "awoooi-api", + field_path: str = "spec.template.spec.containers[0].image", + actual_value: str = "api:hotfix", +) -> dict: + return { + "resource_kind": "Deployment", + "resource_name": resource_name, + "namespace": "awoooi-prod", + "field_path": field_path, + "git_value": "api:main", + "actual_value": actual_value, + "drift_level": "high", + "is_allowlisted": False, + } 
+ + +def test_drift_fingerprint_is_stable_across_item_order() -> None: + item_a = _drift_item(resource_name="awoooi-api") + item_b = _drift_item( + resource_name="awoooi-worker", + field_path="spec.template.spec.serviceAccountName", + actual_value="awoooi-executor", + ) + + first = build_drift_fingerprint("awoooi-prod", [item_a, item_b]) + second = build_drift_fingerprint("awoooi-prod", [item_b, item_a]) + changed = build_drift_fingerprint( + "awoooi-prod", + [item_a, {**item_b, "actual_value": "different-service-account"}], + ) + + assert first == second + assert first.startswith("dfp_") + assert first != changed + + +def test_drift_repeat_state_counts_matching_fingerprint_only() -> None: + now = datetime(2026, 5, 13, 1, 0, tzinfo=timezone.utc) + report = { + "report_id": "drift-now", + "namespace": "awoooi-prod", + "status": "pending", + "scanned_at": now, + "created_at": now, + "items": [_drift_item()], + } + recent = [ + { + **report, + "report_id": "drift-prev", + "scanned_at": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + **report, + "report_id": "drift-different", + "scanned_at": now - timedelta(hours=2), + "created_at": now - timedelta(hours=2), + "items": [_drift_item(actual_value="api:other")], + }, + { + **report, + "report_id": "drift-old", + "scanned_at": now - timedelta(hours=13), + "created_at": now - timedelta(hours=13), + }, + ] + + repeat_state = build_drift_repeat_state(report, recent) + + assert repeat_state["schema_version"] == "drift_repeat_state_v1" + assert repeat_state["fingerprint"].startswith("dfp_") + assert repeat_state["matching_strategy"] == "namespace_and_stable_items_v1" + assert repeat_state["occurrences_12h"] == 2 + assert repeat_state["operator_stage"] == "pending_human" + assert [row["report_id"] for row in repeat_state["reports"]] == [ + "drift-now", + "drift-prev", + ] + + +def test_reconciliation_blocks_open_incident_after_no_action_approval() -> None: + reconciliation = _build_reconciliation( 
+ incident={"incident_id": "INC-1", "status": "INVESTIGATING"}, + approvals=[ + { + "id": "approval-1", + "status": "APPROVED", + "action": "未知操作 | NO_ACTION", + "resolved_at": "2026-05-13T01:00:00+00:00", + } + ], + evidence_rows=[{"sensors_attempted": 8, "sensors_succeeded": 0}], + automation_ops=[], + timeline_events=[], + ) + + codes = {row["code"] for row in reconciliation["mismatches"]} + assert reconciliation["schema_version"] == "incident_reconciliation_v1" + assert reconciliation["consistency_status"] == "blocked" + assert reconciliation["operator_next_state"] == "manual_required" + assert reconciliation["facts"]["incident_closed"] is False + assert reconciliation["facts"]["automation_operation_records"] == 0 + assert "incident_open_after_approval_resolved" in codes + assert "approval_approved_without_execution_record" in codes + assert "approval_no_action_without_execution" in codes + assert "evidence_all_sensors_failed" in codes + assert "timeline_missing_for_approval" in codes + + +def test_reconciliation_marks_consistent_resolved_execution() -> None: + reconciliation = _build_reconciliation( + incident={"incident_id": "INC-2", "status": "RESOLVED"}, + approvals=[ + { + "id": "approval-2", + "status": "APPROVED", + "action": "restart service", + "resolved_at": "2026-05-13T01:00:00+00:00", + } + ], + evidence_rows=[{"sensors_attempted": 8, "sensors_succeeded": 7}], + automation_ops=[{"status": "success"}], + timeline_events=[{"event_type": "executor", "status": "success"}], + ) + + assert reconciliation["consistency_status"] == "consistent" + assert reconciliation["operator_next_state"] == "continue" + assert reconciliation["mismatches"] == [] + + +def test_ansible_truth_surfaces_audited_check_mode_record() -> None: + truth = build_ansible_truth( + [ + { + "op_id": "op-ansible-1", + "operation_type": "ansible_check_mode_executed", + "status": "dry_run", + "actor": "platform_operator", + "input_playbook_path": "infra/ansible/playbooks/188-ai-web.yml", + 
"input_check_mode": "true", + "dry_run_result": {"changed": 1}, + "tags": ["ansible", "check_mode"], + "created_at": "2026-05-12T22:00:00+08:00", + } + ], + incident={"incident_id": "INC-1", "alertname": "momo pg_backup failed on 188"}, + drift=None, + ) + + assert truth["considered"] is True + assert truth["not_used_reason"] is None + assert truth["records"][0]["playbook_path"] == "infra/ansible/playbooks/188-ai-web.yml" + assert truth["records"][0]["check_mode"] == "true" + assert truth["records"][0]["dry_run_result"] == {"changed": 1} + assert "ansible_check_mode_executed" in truth["audit_contract"]["operation_types"] + assert truth["candidate_catalog"]["decision_effect"] == "none" + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:188-ai-web" + assert truth["candidate_catalog"]["candidates"][0]["auto_apply_enabled"] is False + + +def test_ansible_truth_keeps_catalog_hint_separate_from_runtime_use() -> None: + truth = build_ansible_truth( + [], + incident={"incident_id": "INC-2", "alertname": "nginx 502 upstream timeout"}, + drift=None, + ) + + assert truth["considered"] is False + assert truth["records"] == [] + assert truth["not_used_reason"].startswith("no automation_operation_log row") + assert truth["candidate_catalog"]["candidates"][0]["catalog_id"] == "ansible:nginx-sync" + assert truth["candidate_catalog"]["candidates"][0]["approval_required"] is True + assert truth["candidate_catalog"]["decision_effect"] == "none" + + +def test_ansible_decision_audit_payload_is_dry_run_only() -> None: + incident = SimpleNamespace( + incident_id="INC-DOCKER", + project_id="awoooi", + alert_category="infrastructure", + notification_type="TYPE-3", + severity=SimpleNamespace(value="P3"), + affected_services=["bitan-pharmacy-bitan-1"], + signals=[ + SimpleNamespace( + alert_name="DockerContainerUnhealthy", + labels={"alertname": "DockerContainerUnhealthy", "container": "bitan-pharmacy-bitan-1"}, + annotations={}, + ) + ], + ) + + payload = 
build_ansible_decision_audit_payload( + incident=incident, + proposal_data={"source": "expert_system", "risk_level": "low", "action": "NO_ACTION"}, + decision_path="manual_approval", + not_used_reason="manual approval required; Ansible check-mode is not wired yet", + ) + + assert payload is not None + assert payload["operation_type"] == "ansible_candidate_matched" + assert payload["status"] == "dry_run" + assert payload["input"]["executor"] == "ansible" + assert payload["input"]["check_mode"] is True + assert payload["input"]["apply_enabled"] is False + assert payload["input"]["approval_required"] is True + assert payload["input"]["executor_candidates"] + assert payload["output"]["decision_effect"] == "audit_only" + assert payload["dry_run_result"]["check_mode_executed"] is False diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index dc947de9..0923e6e7 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,98 +1,251 @@ -## 2026-05-12 | Source Control Draft Reconcile Plan 草案 +## 2026-05-13 | T4 Config Drift fingerprint repeat-state 已推版 -**背景**:統帥批准繼續推進後,本輪先同步最新 `gitea/main`,納入另一個 AwoooP Session 的 `legacy mcp audit -> gateway timeline` 進度,避免雙 Session 分歧。同步後繼續沿用低摩擦原則,只針對 refs-blocked repo 產生草案,不執行同步。 +**背景**:Config Drift Telegram 卡片只顯示單次 `report_id` 與 HIGH/MEDIUM/INFO 計數,Operator 無法判斷是否同一漂移一直重複、已跑到哪個流程階段、是否需要人工。舊 truth-chain repeat 只用 namespace/status/counts 分組,會把「剛好同計數但 items 不同」誤認為同一漂移。 -**本次交付**: -- 新增 `scripts/security/source-control-reconcile-plan.py`,只讀既有 redacted snapshot,不呼叫遠端 Git,不 fetch、不 push、不改 remote。 -- 新增 `docs/schemas/source_control_reconcile_plan_v1.schema.json`。 -- 產出 `docs/security/source-control-reconcile-plan.snapshot.json` 與 `docs/security/SOURCE-CONTROL-RECONCILE-PLAN.md`。 -- Draft plan 涵蓋 3 個 refs-blocked mapped repos:`wooo/awoooi`、`wooo/clawbot-v5`、`wooo/wooo-aiops`。 -- 更新 `SECURITY-SUPPLY-CHAIN-CONTRACT-MANIFEST`,contract count 從 13 增至 14,新增 `source_control_reconcile_plan_v1`。 -- 更新 `SECURITY-SUPPLY-CHAIN-PROGRESS` 與 
`AWOOOP-MIRROR-ONLY-CONSUMPTION-CHECKLIST`,讓 AwoooP 可 mirror draft plan 但不得執行 refs sync。 +**修正**: +- 新增 `drift_repeat_state.py`: + - 以 namespace + sorted drift items 建立 stable fingerprint。 + - fingerprint 只看 drift 的實際 identity,不看 report_id / 掃描時間。 + - repeat-state schema:`drift_repeat_state_v1`。 +- `awooop_truth_chain_service`: + - drift report 查詢納入 `items`。 + - repeat-state 改用 stable fingerprint,比對 24h 內候選並回傳 12h repeat window。 + - 回傳 `fingerprint`、`matching_strategy=namespace_and_stable_items_v1`、`operator_stage`、matching reports。 +- `drift_narrator_service`: + - Telegram drift card body 會追加: + - `流程: drift_scanned → ai_analyzed → pending_human` + - `重複: 12h 內第 N 次同指紋` + - `指紋: dfp_xxxxx` + - 這仍只揭露真相鏈狀態,不自動採納 / 回滾 / 忽略。 -**邊界**: -- Plan 狀態為 `draft_blocked`;authenticated / admin_export server-side inventory 尚未完成前,不可執行。 -- 未 push refs、未 force push、未刪 refs、未建立 GitHub repo、未改 visibility、未切 GitHub primary、未部署。 -- 人工批准未來也必須單一 repo 生效,不得批次套用到所有 repo。 +**驗證與推版**: +- Local: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest tests/test_awooop_truth_chain_service.py tests/test_phase25_drift_detection.py tests/test_drift_interpreter_ollama_first.py tests/test_platform_router_order.py tests/test_awooop_operator_auth.py -q`:37 passed。 + - `git diff --check`:pass。 +- Gitea: + - `5b348774 feat(awooop): expose drift repeat fingerprint` 已推 `gitea main`。 + - Code Review run `1938`:success。 + - CD run `1937`:success。 + - Deploy marker:`3d38039b chore(cd): deploy 5b34877 [skip ci]`。 +- Production: + - API/Web/Worker image 均為 `5b34877429c16c42f0f894eb4d7f0484711fde9b`。 + - K3s rollout status:API/Web/Worker success。 + - `/api/v1/health`:healthy,mock_mode=false。 + - Truth-chain smoke `7f858956`: + - `source_type=drift_report` + - `current_stage=dedup_or_repeat_updated` + - `stage_status=pending` + - `needs_human=true` + - `repeat_schema=drift_repeat_state_v1` + - `fingerprint=dfp_02dc625b64784b24` + - `matching_strategy=namespace_and_stable_items_v1` + - 
`operator_stage=pending_human` + - `repeat_12h=2` + - `outbound_visible=2` + - Production narrator render smoke: + - `流程: drift_scanned → ai_analyzed → pending_human | 重複: 12h 內第 2 次同指紋 | 指紋: dfp_smoke1234` -**驗證**: -- `source-control-reconcile-plan.py` 產生 3 plans。 -- JSON / schema / snapshot parse 通過。 -- `scripts/security/*.py` 可編譯。 -- `git diff --check` 通過。 -- PR diff added lines 未命中本輪敏感 token / credential pattern。 +**重要校正**: +- 舊 count-based repeat 會把 `7f858956` 算成 12 次。 +- 新 stable fingerprint 顯示同一 items fingerprint 12h 內是 2 次;這代表之前的 12 次是「同計數重複候選」,不是已證明同一漂移。 -## 2026-05-12 | Source Control Approval Board 低摩擦決策隊列 +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible audit contract + decision candidate dry-run audit 完成、部署、production smoke 完成。 +- T4:Config Drift stable fingerprint / repeat-state / Telegram stage visibility 完成、部署、production smoke 完成。 +- 仍未完成:T5 incident / approval / execution reconciliation、Ansible 真正 check-mode executor / diff / apply / rollback、first-class MCP Gateway enforcement。 -**背景**:統帥批准繼續推進後,下一步原本是 Gitea authenticated read-only inventory;但目前 `GITEA_READONLY_TOKEN` 未提供。本輪因此不使用可 push 的既有 Gitea remote credential 代替 read-only token,避免把 inventory 與寫入權限憑證混在一起。 +## 2026-05-13 | T3 Ansible decision candidate audit 已推版 -**本次交付**: -- 新增 `scripts/security/source-control-approval-board.py`,只讀既有 redacted snapshot,不呼叫 Gitea/GitHub API,不需要 token。 -- 新增 `docs/schemas/source_control_approval_board_v1.schema.json`。 -- 產出 `docs/security/source-control-approval-board.snapshot.json` 與 `docs/security/SOURCE-CONTROL-APPROVAL-BOARD.md`。 -- Board 彙整 8 個 target,其中 7 個為 pending 
approval:`awoooi`、`clawbot-v5`、`wooo-aiops`、`wooo-infra-config`、`ewoooc`、`bitan-pharmacy`、`tsenyang-website`;`nexu-io/open-design` 維持 scope review。 -- 更新 `SECURITY-SUPPLY-CHAIN-CONTRACT-MANIFEST`,contract count 從 12 增至 13,新增 `source_control_approval_board_v1`。 -- 更新 `SECURITY-SUPPLY-CHAIN-PROGRESS` 與 `AWOOOP-MIRROR-ONLY-CONSUMPTION-CHECKLIST`,讓 AwoooP 可 mirror board 但不得執行 board item。 +**背景**:T3 第一段只讓 truth-chain 看得到 Ansible audit contract 與 repo playbook catalog;但 AI decision path 還不會留下「曾考慮 Ansible、但尚未進 check-mode/apply」的 first-class record。這會讓 Telegram / Operator Console 仍看不出 Ansible 是否真的被 AI 修復鏈評估過。 + +**修正**: +- `awooop_ansible_audit_service.py` 新增 decision candidate audit payload / writer。 +- `decision_manager` 在 auto-execute / manual-approval 分支排程 best-effort `ansible_candidate_matched` audit write。 +- Audit row 明確是 dry-run / audit-only: + - `status=dry_run` + - `input.executor=ansible` + - `input.check_mode=true` + - `input.apply_enabled=false` + - `input.approval_required=true` + - `output.decision_effect=audit_only` +- Docker/container 類 incident 也會命中 188 / 110 Ansible catalog hints;未來新 decision 可在 truth-chain 顯示「有候選、尚未執行 check-mode」。 + +**驗證與推版**: +- Local: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest apps/api/tests/test_awooop_truth_chain_service.py apps/api/tests/test_platform_router_order.py apps/api/tests/test_awooop_operator_auth.py -q`:14 passed。 + - Tier 3 adjacent tests:133 passed, 1 existing RuntimeWarning。 + - `git diff --check`:pass。 +- Gitea: + - `3799e0db feat(awooop): audit ansible decision candidates` 已推 `gitea main`。 + - Code Review run `1936`:success。 + - CD run `1935`:success。 + - Deploy marker:`90b9ddb7 chore(cd): deploy 3799e0d [skip ci]`。 +- Production: + - API/Web/Worker image 均為 `192.168.0.110:5000/awoooi/*:3799e0db0d30f29fdc251197634d2fca4c2c67fd`。 + - K3s rollout status:API/Web/Worker success。 + - `/api/v1/health`:healthy,mock_mode=false。 + - Pure function smoke(API pod):DockerContainerUnhealthy 事件可產生 
`ansible_candidate_matched` payload,`candidate_count=2`,`check_mode_executed=false`。 + - Truth-chain smoke `INC-20260512-B6C589`: + - `source_type=incident` + - `current_stage=manual_required` + - `stage_status=blocked` + - `needs_human=true` + - `execution.ansible.audit_contract.schema_version=ansible_executor_audit_v1` + - `ansible_candidates=2` + - `mcp_gateway_total=8` + - Truth-chain smoke `7f858956`: + - `source_type=drift_report` + - `current_stage=dedup_or_repeat_updated` + - `stage_status=pending` + - `needs_human=true` + - `repeat_12h=12` + - `outbound_visible=2` + +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible audit contract + decision candidate dry-run audit 完成、部署、production smoke 完成。 +- 仍未完成:Ansible 真正 check-mode executor、diff artifact、apply / rollback audit、T4 drift fingerprint FSM、T5 incident / approval / execution reconciliation、first-class MCP Gateway enforcement。 + +## 2026-05-12 | Security Supply Chain PR #117 累積紀錄 + +**背景**:統帥批准 Kali `192.168.0.112`、開發主機 `192.168.0.111` / `192.168.0.168`、Code Review -> Codex、Gitea -> GitHub 長期遷移納入同一個資安工作項目;同時要求初期不要把資安等級一次拉太高,避免產品、架構與流程變得過度複雜。本支線以乾淨 worktree 建立 PR `#117`,並持續與另一個 AwoooP Session 的 `gitea/main` 同步。 + +**累積交付**: +- 建立 docs-only / contracts-first Security Supply Chain scaffold,涵蓋 Kali、Code Review、Codex、Gitea、GitHub 與 AwoooP mirror-only handoff。 +- 產出 Gitea/GitHub refs diff、Gitea public-only inventory、local remote inventory、GitHub target probe、canonical lineage、110 refs probe、repo-by-repo approval package 與 contract manifest snapshot。 +- 建立 `SOURCE-CONTROL-APPROVAL-BOARD.md`,彙整 8 個 target,其中 7 個為 pending approval;authenticated inventory gate 仍為 `blocked`。 +- 建立 `SOURCE-CONTROL-RECONCILE-PLAN.md`,涵蓋 
`awoooi`、`clawbot-v5`、`wooo-aiops` 三個 refs-blocked mapped repos;狀態仍為 `draft_blocked`。 +- Contract manifest 已收斂到 14 個主要 contract,可供 AwoooP mirror / read-only policy / approval candidate 消費,但不得作 execution router。 **邊界**: - 未使用 Gitea write-capable remote credential 做 authenticated inventory。 - 未建立 GitHub repo、未改 visibility、未同步 refs、未切 GitHub primary、未部署。 -- authenticated inventory gate 仍為 `blocked`,等待 read-only token 或 redacted admin export。 +- AwoooP 可 mirror board / plan / policy,但不得執行 board item 或新增高風險 action button。 **驗證**: -- `source-control-approval-board.py` 產生 8 items,pending approval 7。 - JSON / schema / snapshot parse 通過。 - `scripts/security/*.py` 可編譯。 - `git diff --check` 通過。 - PR diff added lines 未命中本輪敏感 token / credential pattern。 -## 2026-05-12 | Security Supply Chain PR #117 與 AwoooP 主線同步 +## 2026-05-12 | T3 Ansible audit surface 第一段 -**背景**:Security Supply Chain docs-only 分支完成首次推版後,另一個 AwoooP Session 已將 `feat(awooop): harden outbound truth chain mirror` 與 deploy marker 推入 `gitea/main`。為避免雙 Session 推進互相衝突,本輪先把最新 `gitea/main` 合入資安分支,再建立 review-only PR。 +**背景**:Telegram / truth-chain live audit 顯示 Ansible 目前仍只是 repo/主機部署工具,沒有出現在 AI 自動化修復鏈路的 first-class audit record;Operator 無法知道「是否被考慮、是否 dry-run、為何沒用」。 -**本次同步**: -- 資安分支 `codex/security-supply-chain-contracts-20260512` 已合入最新 `gitea/main`,merge commit 為 `dc540cba`。 -- 已建立 Gitea PR `#117`:`http://192.168.0.110:3001/wooo/awoooi/pulls/117`。 -- PR 維持 review-only / docs-first / contracts-first;未合併、未部署、未切 GitHub primary。 -- AwoooP 主線 runtime / migration / k8s 變更由 `gitea/main` 保留,本資安分支的 PR diff 仍只呈現資安文件、schema、snapshot 與 read-only tooling。 +**修正**: +- 新增 migration `adr090d_ansible_operation_types.sql`,擴充 `automation_operation_log.operation_type`: + - `ansible_candidate_matched` + - `ansible_check_mode_executed` + - `ansible_apply_executed` + - `ansible_rollback_executed` + - `ansible_execution_skipped` +- 新增 rollback migration `adr090d_ansible_operation_types_down.sql`;`run-migration.yml` 會跳過 `_down.sql`。 +- 新增 
`awooop_ansible_audit_service.py`: + - 讀取 automation ops 中的 Ansible operation type/tag/backend。 + - 暴露 repo 既有 playbook catalog hint。 + - 明確標示 `decision_effect=none`,避免把候選 playbook 當成已執行。 +- truth-chain `execution.ansible` 現在會顯示: + - `considered` 是否有真實 Ansible audit record。 + - `records`、`audit_contract`、`candidate_catalog`、`not_used_reason`。 +- `incident_timeline_service` 補 Ansible operation type → stage mapping。 **驗證**: -- `gitea/main` 已是資安分支祖先。 -- `python3 -m py_compile scripts/security/*.py` 通過。 -- `git diff --check gitea/main...HEAD` 通過。 -- JSON / schema / snapshot parse 通過,`security_supply_chain_contract_manifest_v1` 12 個 contracts path check 通過。 -- PR diff added lines 未命中本輪敏感 token / credential pattern。 +- `py_compile`:Ansible audit service / truth-chain / incident timeline / truth-chain tests 通過。 +- `ruff --select F,E9`:All checks passed。 +- `pytest apps/api/tests/test_awooop_truth_chain_service.py apps/api/tests/test_platform_router_order.py apps/api/tests/test_awooop_operator_auth.py -q`:13 passed。 +- `ruby YAML.load_file(".gitea/workflows/run-migration.yml")`:ok。 +- `git diff --check`:ok。 -**下一步**: -- 等 PR review 後再決定是否合併;不得直接切 GitHub primary 或啟動 refs sync。 -- Gitea read-only inventory approval 未批准前,private/internal server-side repo list 保持 blocked。 -- 下一階段仍維持低摩擦 observe-first,不做 runtime blocking。 +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- T0:Truth-chain read-only API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class Gateway enforced path 仍待後續 wave。 +- T3:Ansible first-class audit contract / truth-chain 可見性完成、已部署;尚未把 approval execution path 寫入 Ansible dry-run/check-mode。 +- 下一步:T3 第二段接 decision / approval execution 的 Ansible check-mode audit row,仍不直接 apply。 -## 2026-05-12 | Security Supply Chain docs-only contract manifest +**production push 追加**: +- Gitea `run-migration` run `1933` 顯示 
migration 本體已成功: + - `adr090d_ansible_operation_types.sql` 以 owner fallback 套用成功。 +- 但 audit seed 仍失敗,這次不是 `:'commit_sha'`,而是 tools JSON literal 在 unquoted heredoc 下仍保留反斜線: + - `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb` + - PostgreSQL 回 `invalid input syntax for type json`。 +- 已修 `.gitea/workflows/run-migration.yml`:tools JSON 改為 `'{"psql": 1, "gitea_ci": 1}'::jsonb`。 +- 已補 production `asset_discovery_run` repair audit row: + - `triggered_by=codex:gitea-migration-audit-repair` + - `summary.type=ci_migration_manual_repair` + - `summary.commit_sha=ca80972dc73cb647f8fab3bf9439784c4b8eef7b` +- Production DB constraint 驗證:`automation_operation_log_type_valid` 已包含全部 `ansible_*` operation types。 +- CD 部署: + - `07000dae chore(cd): deploy ca80972 [skip ci]` + - API/Web/Worker image 均為 `ca80972dc73cb647f8fab3bf9439784c4b8eef7b` + - rollout success。 +- Truth-chain smoke(B6C589): + - `truth_status=manual_required/blocked` + - `mcp_gateway_total=8` + - `execution.ansible.considered=false` + - `execution.ansible.records=0` + - `not_used_reason=no automation_operation_log row with Ansible operation type, tag, or executor backend for this source` + - `audit_contract.schema_version=ansible_executor_audit_v1` +- Caveat:下一個 migration push 仍需 live 驗證 `run-migration` audit seed 是否完全通過;本輪 workflow 修正後沒有新的 migration 觸發可重跑。 -**背景**:統帥批准 Kali `192.168.0.112`、開發主機 `192.168.0.111` / `192.168.0.168`、Code Review -> Codex、Gitea -> GitHub 長期遷移納入同一個資安工作項目;同時要求初期不要把資安等級一次拉太高,避免產品、架構與流程變得過度複雜。 +**T3 第二段本地實作**: +- `awooop_ansible_audit_service.py` 新增 decision audit payload/writer: + - 只有 static catalog 有候選 playbook 時才寫 `automation_operation_log`。 + - operation_type=`ansible_candidate_matched`。 + - status=`dry_run`。 + - `input.executor=ansible`、`check_mode=true`、`apply_enabled=false`、`approval_required=true`。 + - `output.decision_effect=audit_only`。 +- `decision_manager` 在 auto-execute / manual-approval 分支都排程 best-effort audit write: + - 不改 executor。 + - 不跑 Ansible。 + - 不阻塞決策和 Telegram。 +- 
Docker/container 類 incident 也會命中 Ansible catalog hint,讓 B6C589 這類事件後續新 decision 能留下 Ansible candidate audit row。 +- 本地驗證: + - `py_compile`:pass。 + - `ruff --select F,E9`:pass。 + - `pytest test_awooop_truth_chain_service.py test_platform_router_order.py test_awooop_operator_auth.py -q`:14 passed。 + - `git diff --check`:pass。 +- 待推版與 production smoke。 -**本次交付**: -- 建立 Kali / Code Review / GitHub / Gitea / Codex / AwoooP 的 docs-only security supply chain scaffold。 -- 建立 `security_finding_v1`、`coding_task_v1`、`source_control_migration_event_v1`、`gitea_repo_inventory_v1`、`local_git_remote_inventory_v1`、`github_target_probe_v1`、`github_target_decision_v1`、`github_target_repo_approval_package_v1`、`local_repo_canonical_probe_v1`、`git_remote_refs_probe_v1`、`approval_required_event_v1`、`security_rollout_policy_v1`、`security_supply_chain_contract_manifest_v1` schema 草案。 -- 產出 Gitea/GitHub refs diff、Gitea public-only inventory、local remote inventory、GitHub target probe、canonical lineage、110 refs probe、repo-by-repo approval package 與 contract manifest snapshot。 -- 明確採低摩擦 `observe-first` / `mirror_only`:LOW / MEDIUM observation 先 observe / warn;只有 read-only token、repo creation、visibility change、refs sync、secret、deploy、primary switch 等高風險動作才進 approval。 +## 2026-05-12 | run-migration audit seed 再修正 -**邊界**: -- 本輪只做文件、schema、read-only scripts 與 redacted snapshots。 -- 未建立 repo、未修改 visibility、未同步 refs、未切 GitHub primary、未部署、未碰 runtime enforcement。 -- AwoooP 只可 mirror / read-only policy / approval candidate,不可把 manifest 當 execution router。 +**背景**:Gitea `run-migration` 在 `Seed asset_discovery_run (audit)` 再次失敗: + +```text +ERROR: syntax error at or near ":" +LINE 16: 'commit_sha', :'commit_sha', +``` + +**修正**: +- `.gitea/workflows/run-migration.yml` 不再依賴 `psql` 的 `:'commit_sha'` / `:'files_json'` 變數展開。 +- 改由 `jq` 先產生完整 `summary` JSON,再以 shell-safe SQL literal 寫入 `asset_discovery_run.summary`。 +- 保留 owner connection fallback,只修 audit seed,不改 migration apply 流程。 **驗證**: -- JSON / schema 
parse 通過。 -- `scripts/security/*.py` 可編譯。 -- `git diff --check` 通過。 -- 新增 / 修改內容未命中本輪敏感 token / credential pattern。 +- `ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/run-migration.yml")'`:yaml ok。 +- 抽出 `Seed asset_discovery_run (audit)` step 後 `bash -n`:通過。 +- mock `psql` 實跑該 step:rendered SQL 已無 `:'...'` psql 變數,並包含 `commit_sha` / `files` JSON。 +- `git diff --check`:通過。 -**下一步**: -- 等 Gitea read-only inventory approval 被批准後,補 private/internal server-side repo list。 -- 逐 repo 取得 owner / visibility / canonical 決策。 -- 對 refs blocked repos 產生 reconcile plan;GitHub primary 仍保持 blocked。 +**整體進度**: +- Wave 0:MOMO PostgreSQL backup → AwoooP 失敗通知接線完成並已推版。 +- Truth-chain T0:read-only truth-chain API 完成、部署、production smoke 完成。 +- T1:Channel Event hardening 完成、部署、production smoke 完成。 +- T2:legacy MCP audit bridge / backfill / truth-chain visibility 完成、部署、production smoke 完成;first-class MCP Gateway enforced path 仍待後續 wave。 +- 本次:CI migration audit seed 紅燈修正完成,待推 Gitea main 觀察下一次 `run-migration`。 +- 下一步:回到 T3 Ansible declarative executor 盤點與 first-class audit surface。 ## 2026-05-12 | Truth-chain T0 read-only API 第一版 @@ -6601,3 +6754,67 @@ gateway_audit_total=0 last_15m=0 bridge_total=0 - 因此目前只能宣稱「T2 bridge 寫入能力已部署並經 rollback smoke 驗證」。 - 尚不能宣稱「所有 MCP / 自建 MCP 都已完全經 AwoooP Gateway 強制治理」;下一段要讓下一個真實 incident / MCP 呼叫自然產生 durable bridge row,或把高頻 caller 改成 first-class `McpGateway`。 + +**T2 backfill / truth-chain visibility 追加**: + +- 新增 `scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql`: + - 將最近 24h 真實 `mcp_audit_log` 鏡像到 `awooop_mcp_gateway_audit`。 + - 以 `gate_result.legacy_audit_id` 做 idempotency key。 + - bridge row 保留 `policy_enforced=false` 與 `not_used_reason`,避免誤判為五閘門已 enforcement。 +- production 已執行 backfill: + +```text +inserted_bridge_rows=1160 +gateway_total=1310 bridge_total=1310 last_24h=1276 +B6C589_gateway_rows=8 failed=8 success=0 +``` + +- truth-chain API 追加 `gate_result` 欄位,並把 JSONB text 解析回物件,讓 UI 能顯示 bridge reason。 + +```text +py_compile: 
+apps/api/src/services/awooop_truth_chain_service.py +apps/api/tests/test_awooop_truth_chain_service.py +# OK + +ruff F,E9: +# All checks passed + +pytest: +apps/api/tests/test_awooop_truth_chain_service.py +apps/api/tests/test_platform_router_order.py +apps/api/tests/test_awooop_operator_auth.py +# 11 passed +``` + +**效果**: + +- `INC-20260512-B6C589` truth-chain 現在不再是 `awooop_mcp_gateway_audit_empty`。 +- 仍顯示 `manual_required/blocked`,因為 8 個 SSH MCP 都失敗,approval/incident 狀態仍矛盾;這是 T5 要處理,不能用 T2 粉飾成自動修復完成。 + +**production deploy / endpoint smoke 追加(完成)**: + +```text +Gitea: +1928 CD Pipeline b4d367ee -> success +1929 Code Review b4d367ee -> success + +K8s image: +awoooi-api 192.168.0.110:5000/awoooi/api:b4d367eeb463eccda5aec8aa9c90f19897dbd634 +awoooi-worker 192.168.0.110:5000/awoooi/api:b4d367eeb463eccda5aec8aa9c90f19897dbd634 +awoooi-web 192.168.0.110:5000/awoooi/web:b4d367eeb463eccda5aec8aa9c90f19897dbd634 + +health: +http://192.168.0.125:32334/api/v1/health -> 200 healthy + +Truth-chain: +GET /api/v1/platform/truth-chain/INC-20260512-B6C589?project_id=awoooi -> 200 +stage=manual_required status=blocked needs_human=True +blockers=all_evidence_sensors_failed, + approval_resolved_no_action_without_execution, + incident_still_investigating_after_approval +gateway_total=8 legacy_total=8 +first_gateway_tool=legacy:ssh_host:ssh_get_nginx_error_log result=failed +gate_schema=legacy_mcp_bridge_v1 policy_enforced=False +not_used_reason=legacy direct provider path; bridge audit only +``` diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 15f6a18c..5b4be5f9 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1883,12 +1883,83 @@ Phase 6 完成後 - T2 bridge image `94d006ea` 已部署,CD run `1921` success,health 200。 - rollback smoke 證明 `record_mcp_call()` 在同一 
transaction 內會同時寫 legacy `mcp_audit_log` 與 `awooop_mcp_gateway_audit` bridge row,且 bridge row 標示 `policy_enforced=false` / `not_used_reason=legacy direct provider path; bridge audit only`;rollback 後兩邊皆未污染 production。 - 部署後短觀察窗內沒有自然新 legacy MCP call(`legacy_mcp_15m=0`),所以 live `awooop_mcp_gateway_audit` total 仍是 0。T2 bridge capability 已上線,但 T2 全退出條件仍需下一個真實 MCP 呼叫產生 durable row,或把高頻 caller 改成 first-class Gateway path。 +- 已執行最近 24h 真實 legacy MCP backfill:`inserted_bridge_rows=1160`,目前 `awooop_mcp_gateway_audit gateway_total=1310 / bridge_total=1310 / last_24h=1276`。`INC-20260512-B6C589` 現在 gateway side 可見 8 筆 MCP,8 failed / 0 success;truth-chain blocker 移除 `awooop_mcp_gateway_audit_empty`,但仍是 `manual_required/blocked`,因為 evidence sensors 全失敗、NO_ACTION approval 無 execution、incident 仍 investigating。 +- truth-chain API 追加回傳 `gate_result`,讓 Operator Console 可直接顯示 `policy_enforced=false` 與 `not_used_reason`,避免把 bridge row 誤認為 first-class Gateway enforcement。 +- `b4d367ee` 已部署,CD run `1928` success。B6C589 endpoint smoke:`gateway_total=8 / legacy_total=8`,第一筆 gateway row 顯示 `gate_schema=legacy_mcp_bridge_v1`、`policy_enforced=False`、`not_used_reason=legacy direct provider path; bridge audit only`;truth status 仍是 `manual_required/blocked`。 **仍未宣稱完成**: - 這只是 legacy bridge,不是把所有呼叫強制改經 AwoooP Gateway;T2 後續仍要把新 MCP caller 收斂到 first-class Gateway path。 --- +### 2026-05-12 晚 (台北) — T3 Ansible declarative executor audit surface 第一段 + +**範圍**: +- `automation_operation_log.operation_type` CHECK 追加 Ansible executor audit states: + `ansible_candidate_matched` / `ansible_check_mode_executed` / + `ansible_apply_executed` / `ansible_rollback_executed` / + `ansible_execution_skipped`。 +- 新增 `awooop_ansible_audit_service.py`,把 repo 既有 Ansible playbook catalog 以 + read-only 方式暴露給 truth-chain。 +- truth-chain `execution.ansible` 改為顯示: + - 是否真的有 `automation_operation_log` Ansible audit record。 + - audit contract / required fields。 + - static catalog keyword hints,且 `decision_effect=none`,避免把候選 
playbook 誤判成已自動修復。 +- `incident_timeline_service` 加入 Ansible operation type stage mapping。 + +**已驗證**: +- 本地 `py_compile` / `ruff F,E9` / `git diff --check` 通過。 +- `test_awooop_truth_chain_service.py`、router order、operator auth 共 13 passed。 +- `run-migration.yml` YAML parse 通過;新增 `_down.sql` 會被既有 workflow skip 規則排除。 + +**仍未宣稱完成**: +- 這不是 Ansible 自動修復執行器接線;目前只建立 first-class audit contract 與 truth-chain 可見性。 +- 下一段需把 decision / approval execution path 在「只 dry-run/check-mode」下寫入上述 operation types,再談 apply。 + +**production 追加**: +- Gitea `run-migration` run `1933`:`adr090d_ansible_operation_types.sql` 已成功套用,含 owner fallback。 +- 同 run 的 `Seed asset_discovery_run (audit)` 仍失敗;新根因是 unquoted heredoc 下 tools JSON literal 還寫成 `'{\"psql\": 1, \"gitea_ci\": 1}'::jsonb`,PostgreSQL 視為非法 JSON。 +- 後續修正:workflow tools JSON literal 改成 `'{"psql": 1, "gitea_ci": 1}'::jsonb`。 +- 已補 production `asset_discovery_run` repair audit row(`ci_migration_manual_repair` / `commit_sha=ca80972dc73cb647f8fab3bf9439784c4b8eef7b`)。 +- Production DB constraint 已確認包含全部 `ansible_*` operation types。 +- CD 已部署 `ca80972d` image,deploy marker `07000dae`;API/Web/Worker rollout success。 +- B6C589 truth-chain smoke:`manual_required/blocked`、`mcp_gateway_total=8`、`execution.ansible.considered=false`、`records=0`、not_used_reason 清楚顯示沒有 Ansible audit record。 +- 下一個 migration push 仍需驗證 `run-migration` audit seed live gate,因本輪 workflow 修正後未再新增 migration 觸發重跑。 + +**T3 第二段本地追加**: +- `decision_manager` 在 auto-execute / manual-approval 分支新增 best-effort Ansible candidate audit write。 +- 僅在 catalog 有候選 playbook 時寫 `automation_operation_log`: + `operation_type=ansible_candidate_matched`、`status=dry_run`、 + `input.check_mode=true`、`input.apply_enabled=false`、 + `output.decision_effect=audit_only`。 +- 這仍不是 Ansible 執行器;它只讓 truth-chain 能看到 AI decision path 曾考慮 Ansible candidate,以及為何未進入 check-mode/apply。 +- 本地 `py_compile` / `ruff F,E9` / 14 個 truth-chain/operator/router tests 通過;待推版和 production smoke。 + +**T3 第二段 production 
verified(2026-05-13 台北)**: +- `3799e0db feat(awooop): audit ansible decision candidates` 已推 Gitea main,Code Review run `1936` success,CD run `1935` success。 +- Deploy marker:`90b9ddb7 chore(cd): deploy 3799e0d [skip ci]`。 +- Production API/Web/Worker image 均為 `3799e0db0d30f29fdc251197634d2fca4c2c67fd`,K3s rollout success,health 200 / `mock_mode=false`。 +- API pod pure smoke:DockerContainerUnhealthy 事件可產生 `ansible_candidate_matched` audit payload,`candidate_count=2`,`check_mode_executed=false`。 +- Truth-chain smoke: + - `INC-20260512-B6C589` → `manual_required/blocked`,`mcp_gateway_total=8`,`execution.ansible.audit_contract=ansible_executor_audit_v1`,`ansible_candidates=2`。 + - `7f858956` → `dedup_or_repeat_updated/pending`,`repeat_12h=12`,`outbound_visible=2`。 +- 邊界:仍未執行 Ansible check-mode / apply / rollback;T3 目前完成的是 first-class candidate audit,而不是修復執行器。 + +**T4 Config Drift fingerprint repeat-state production verified(2026-05-13 台北)**: +- `5b348774 feat(awooop): expose drift repeat fingerprint` 已推 Gitea main,Code Review run `1938` success,CD run `1937` success。 +- Deploy marker:`3d38039b chore(cd): deploy 5b34877 [skip ci]`。 +- 新增 `drift_repeat_state_v1`:以 namespace + sorted drift items 建 stable fingerprint,不再只靠 HIGH/MEDIUM/INFO counts。 +- Truth-chain drift repeat-state 現在回傳 `fingerprint`、`matching_strategy=namespace_and_stable_items_v1`、`operator_stage`、matching reports。 +- Telegram drift narrator 會在 card body 補: + - `流程: drift_scanned → ai_analyzed → pending_human` + - `重複: 12h 內第 N 次同指紋` + - `指紋: dfp_xxxxx` +- Production `7f858956` smoke:`repeat_schema=drift_repeat_state_v1`、`fingerprint=dfp_02dc625b64784b24`、`operator_stage=pending_human`、`repeat_12h=2`、`outbound_visible=2`。 +- 重要校正:舊 count-based repeat 看到 12 次,新 stable item fingerprint 證實同一漂移 fingerprint 只有 2 次;12 次只能稱為同計數候選,不能稱為同一漂移。 +- 邊界:T4 只補可觀測與重複判定,不做 auto-adopt / rollback / ignore。 + +--- + ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d) **觸發**:統帥全景盤查 AI 自動化節點後,發現 Playbook 自動修復鏈路有 
3 個結構性斷點。 diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index b0c7434f..5a30a1a3 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -40,7 +40,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: 94d006eac88fd65f6efca817eb392a103ec10d3f + newTag: 5b34877429c16c42f0f894eb4d7f0484711fde9b - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: 94d006eac88fd65f6efca817eb392a103ec10d3f + newTag: 5b34877429c16c42f0f894eb4d7f0484711fde9b diff --git a/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql b/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql new file mode 100644 index 00000000..509621f3 --- /dev/null +++ b/scripts/ops/awooop-mcp-gateway-bridge-backfill-24h.sql @@ -0,0 +1,69 @@ +-- AwoooP T2 MCP Gateway bridge backfill (24h) +-- 2026-05-12 Codex + ogt +-- +-- Purpose: +-- Mirror real legacy mcp_audit_log rows into awooop_mcp_gateway_audit so +-- truth-chain can show MCP usage for recent incidents while first-class +-- Gateway migration continues. These rows are explicitly marked as bridge +-- records and policy_enforced=false; they are not proof of five-gate +-- Gateway enforcement. +-- +-- Idempotency: +-- gate_result.legacy_audit_id stores the mcp_audit_log.id source key. +-- Re-running this SQL will only insert missing rows. 
+-- Backfill statement: mirror recent legacy MCP audit rows into the gateway audit table, then report how many rows were inserted.
+WITH inserted AS (  -- data-modifying CTE; its RETURNING rows feed the final COUNT
+    INSERT INTO awooop_mcp_gateway_audit (  -- column order here must match the SELECT list below position by position
+        project_id,
+        run_id,
+        trace_id,
+        agent_id,
+        tool_name,
+        input_hash,
+        output_hash,
+        gate_result,
+        result_status,
+        block_gate,
+        block_reason,
+        latency_ms,
+        created_at
+    )
+    SELECT
+        'awoooi' AS project_id,  -- every backfilled row is written under this fixed project id
+        NULL::uuid AS run_id,  -- no run id is recorded for backfilled rows
+        LEFT(COALESCE(src.incident_id, src.session_id), 128) AS trace_id,  -- incident id, else session id, truncated to 128 chars; NULL when both are NULL
+        LEFT(COALESCE(src.agent_role, 'legacy-mcp-provider'), 128) AS agent_id,  -- placeholder agent id when the legacy row has no role
+        LEFT('legacy:' || src.mcp_server || ':' || src.tool_name, 128) AS tool_name,  -- NOTE(review): || yields NULL if mcp_server or tool_name is NULL -- confirm both columns are NOT NULL
+        encode(digest(COALESCE(src.input_params::text, 'null'), 'sha256'), 'hex') AS input_hash,  -- hex SHA-256 of the params text; NULL params hash the literal string 'null'. NOTE(review): digest() is presumably pgcrypto's -- confirm the extension is installed
+        CASE
+            WHEN src.output_result IS NULL THEN NULL  -- no output recorded, so no output hash
+            ELSE encode(digest(src.output_result::text, 'sha256'), 'hex')  -- hex SHA-256 of the output text
+        END AS output_hash,
+        jsonb_build_object(  -- provenance payload that marks the row as a bridge record
+            'schema_version', 'legacy_mcp_bridge_v1',  -- matched by the NOT EXISTS guard below
+            'gateway_path', 'legacy_backfill',
+            'policy_enforced', false,  -- explicitly records that gateway policy was not enforced for this call
+            'not_used_reason', 'legacy direct provider path; bridge audit only',
+            'legacy_audit_id', src.id::text,  -- source-row key; the NOT EXISTS guard below uses it for idempotency
+            'legacy_mcp_server', src.mcp_server,
+            'legacy_tool_name', src.tool_name,
+            'flywheel_node', src.flywheel_node
+        ) AS gate_result,
+        CASE WHEN src.success IS TRUE THEN 'success' ELSE 'failed' END AS result_status,  -- IS TRUE: both FALSE and NULL success map to 'failed'
+        NULL::smallint AS block_gate,  -- backfilled rows never name a blocking gate
+        CASE WHEN src.success IS TRUE THEN NULL ELSE LEFT(src.error_message, 256) END AS block_reason,  -- error text (max 256 chars) only for non-successful calls
+        src.duration_ms AS latency_ms,
+        src.created_at  -- keeps the legacy row's timestamp rather than the backfill time
+    FROM mcp_audit_log src
+    WHERE src.created_at > NOW() - INTERVAL '24 hours'  -- only legacy rows created within the last 24 hours
+      AND NOT EXISTS (  -- skip legacy rows that already have a bridge row, so re-runs insert only missing rows
+            SELECT 1
+            FROM awooop_mcp_gateway_audit dst
+            WHERE dst.project_id = 'awoooi'
+              AND dst.gate_result->>'schema_version' = 'legacy_mcp_bridge_v1'
+              AND dst.gate_result->>'legacy_audit_id' = src.id::text  -- text comparison against the key stored in gate_result above
+      )
+    RETURNING call_id  -- one returned row per inserted bridge row
+)
+SELECT COUNT(*) AS inserted_bridge_rows  -- number of rows inserted by this run; 0 when nothing was missing
+FROM inserted;