awoooi/apps/api/src/services/governance_agent.py

"""AI 自我治理 Agent

四項自檢，每 1 小時執行一次：
1. trust_drift         — Playbook trust_score < 0.2 → 告警建議廢棄
2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
3. llm_hallucination   — 近 100 筆 evidence verification_result=failed 比例 > 10%
4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
5. slo_compliance      — 4 個 SLO 合規性檢查（ADR-100），違反時降級飛輪行為

所有 check 互相隔離（try/except），任一失敗不阻斷其他項目。

2026-04-26 P2.2 by Claude
2026-04-27 P3.4 by Claude — 新增 SLO 合規性自檢（ADR-100）
"""

from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import Any

import structlog
from sqlalchemy import func, select

from src.db.base import get_db_context
from src.db.models import (
    AiGovernanceEvent,
    AutoRepairExecution,
    IncidentEvidence,
    KnowledgeEntryRecord,
    PlaybookRecord,
)
from src.models.knowledge import EntryStatus
from src.utils.timezone import now_taipei

logger = structlog.get_logger(__name__)

# =============================================================================
# 閾值常數
# =============================================================================
TRUST_DRIFT_THRESHOLD = 0.2          # playbook trust_score below this value -> alert
# 2026-05-02 ogt + Claude Sonnet 4.6: trust_drift auto-deprecate
# trust < 0.2 + (last used more than N days ago OR never used and created more than N days ago) -> auto-deprecate
# N is 30 days so a playbook gets an ample trial period and a new proposal is not retired after a few early failures
TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS = 30
KM_STALE_DAYS = 7                     # knowledge entries not updated for this many days count as stale
KM_STALE_RATIO = 0.20                 # stale ratio above this value -> alert
HALLUCINATION_RATE_THRESHOLD = 0.10   # LLM verification "failed" ratio above this value -> alert
EXECUTION_FAIL_RATE_THRESHOLD = 0.15  # execution failure ratio above this value -> alert
RECENT_LIMIT = 100                    # number of most-recent rows sampled by the rate checks


# =============================================================================
# GovernanceAgent
# =============================================================================

class GovernanceAgent:
    """AI 自我治理 Agent — 5 項自檢 + 1h 排程

    1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
    5:   slo_compliance（ADR-100 SLO 合規性）

    2026-04-26 P2.2 by Claude
    2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance
    """

    def __init__(self, alerter=None) -> None:
        """Create the agent with an optional injected alerter.

        Args:
            alerter: Object used by ``_alert`` to push notifications; it must
                expose an awaitable ``alert_governance(event_type, payload)``.
                When ``None``, ``_alert`` resolves one lazily through
                ``get_failover_alerter()``.
        """
        # alerter: FailoverAlerter instance (injectable; defaults to the singleton)
        self._alerter = alerter

    # =========================================================================
    # 1. Playbook 信任度漂移
    # =========================================================================

    async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
        """Playbook trust_score < 0.2 → 告警建議廢棄；30 天沒用過的直接 auto-deprecate

        2026-04-26 P2.2 by Claude
        2026-05-02 ogt + Claude Sonnet 4.6: 加 auto_deprecate_low_trust_unused 自治路徑
        守衛條件：trust < 0.2 AND (last_used_at < 30 天前 OR 從未使用且創建超過 30 天)
        → status 改 'deprecated'，alert 改報「N 個告警 + M 個 auto-deprecated」
        2026-05-05 Codex: emit_alert=False 供 W-6 watchdog 查詢統計，維持
        governance_agent 單一入口，但避免與 hourly self-check 發出雙重 Telegram。
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(PlaybookRecord).where(
                    PlaybookRecord.status.not_in(["deprecated", "archived"])
                )
            )
            all_records = result.scalars().all()

            total = len(all_records)
            drifted = [r for r in all_records if float(r.trust_score) < TRUST_DRIFT_THRESHOLD]

            # auto-deprecate eligibility
            cutoff = now_taipei() - timedelta(days=TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS)
            auto_deprecated_ids: list[str] = []
            kept_ids: list[str] = []
            for r in drifted:
                last = r.last_used_at
                created = r.created_at
                # 沒用過 → 用 created_at 作為「進入系統時間」
                ref_time = last if last is not None else created
                if ref_time is not None and ref_time < cutoff:
                    r.status = "deprecated"
                    auto_deprecated_ids.append(r.playbook_id)
                else:
                    kept_ids.append(r.playbook_id)

            # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 1 修復（P0 silent failure）
            # 原 await db.commit() 在 with 區塊外呼叫，session 已被 context manager
            # 關閉後 auto-commit，二次 commit 拋 InvalidRequestError 被外層 try/except 吞掉
            # 修法：commit 移入 with 區塊內，在 session 有效期間顯式提交
            if auto_deprecated_ids:
                await db.commit()
                logger.info(
                    "governance_trust_drift_auto_deprecated",
                    count=len(auto_deprecated_ids),
                    ids=auto_deprecated_ids[:10],
                )

        if drifted and emit_alert:
            drift_ratio = len(drifted) / total if total > 0 else 0.0
            await self._alert(
                "trust_drift",
                {
                    "status": "warning",
                    "impact": {
                        "drifted_count": len(drifted),
                        "total_playbooks": total,
                        "drift_ratio": round(drift_ratio, 3),
                        "threshold": TRUST_DRIFT_THRESHOLD,
                        "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "Auto-deprecate low-trust stale playbooks",
                            "Review candidate playbooks by impact scope and rollback if needed",
                        ],
                        "auto_deprecated_count": len(auto_deprecated_ids),
                        "auto_deprecated_ids": auto_deprecated_ids[:10],
                    },
                    "actionable": {
                        "items": [
                            "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
                            "必要時人工覆核 kept_ids 中的高風險 Playbook",
                        ],
                        "sample_playbook_ids": kept_ids[:10],
                    },
                    "drifted_count": len(drifted),
                    "auto_deprecated_count": len(auto_deprecated_ids),
                    "auto_deprecated_ids": auto_deprecated_ids[:10],
                    "playbook_ids": kept_ids[:10],
                    "total_playbooks": total,
                    "threshold": TRUST_DRIFT_THRESHOLD,
                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                },
            )

        logger.info(
            "governance_trust_drift_checked",
            total=total,
            drifted=len(drifted),
            auto_deprecated=len(auto_deprecated_ids),
            kept=len(kept_ids),
        )
        drift_ratio = len(drifted) / total if total > 0 else 0.0
        return {
            "checked": total,
            "drifted": len(drifted),
            "drift_ratio": drift_ratio,
            "auto_deprecated": len(auto_deprecated_ids),
            "kept": len(kept_ids),
        }

    # =========================================================================
    # 2. 知識庫衰退
    # =========================================================================

    async def check_knowledge_degradation(self) -> dict[str, Any]:
        """KM 7 天未更新 > 20% 總量 → 告警知識衰退

        2026-04-26 P2.2 by Claude
        """
        stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS)

        async with get_db_context() as db:
            # 非 archived 總數
            total_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED
                )
            )
            total = total_result.scalar() or 0

            # 7 天內未更新（updated_at < cutoff）且非 archived
            stale_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                    KnowledgeEntryRecord.updated_at < stale_cutoff,
                )
            )
            stale = stale_result.scalar() or 0

        ratio = stale / total if total > 0 else 0.0

        if total > 0 and ratio > KM_STALE_RATIO:
            await self._alert(
                "knowledge_degradation",
                {
                    "status": "warning",
                    "impact": {
                        "stale_count": stale,
                        "total_count": total,
                        "stale_ratio": round(ratio, 3),
                        "threshold": KM_STALE_RATIO,
                        "stale_days": KM_STALE_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "啟動 KM 反查與自動補齊流程",
                            "關鍵服務告警自動同步到 KM 任務，補齊缺失條目",
                        ],
                        "next_action": "run_kb_growth_healthcheck",
                    },
                    "actionable": {
                        "items": [
                            "每日檢查 ANTI_PATTERN 更新結果",
                            "安排至少 2 位 owner 對 stale條目做快速人工審核",
                        ],
                    },
                    "stale_count": stale,
                    "total_count": total,
                    "stale_ratio": round(ratio, 3),
                    "threshold": KM_STALE_RATIO,
                    "stale_days": KM_STALE_DAYS,
                },
            )

        logger.info(
            "governance_knowledge_degradation_checked",
            total=total,
            stale=stale,
            ratio=round(ratio, 3),
        )
        return {"total": total, "stale": stale, "ratio": round(ratio, 3)}

    # =========================================================================
    # 3. LLM 幻覺率
    # =========================================================================

    async def check_llm_hallucination(self) -> dict[str, Any]:
        """最近 100 筆 IncidentEvidence verification_result=failed 比例 > 10% → 告警

        verification_result 可能值：success / degraded / failed / timeout
        只有 'failed' 視為幻覺（LLM 判斷錯誤導致執行後驗證失敗）

        2026-04-26 P2.2 by Claude
        """
        async with get_db_context() as db:
            # 取最近 RECENT_LIMIT 筆有 verification_result 的記錄
            result = await db.execute(
                select(IncidentEvidence.verification_result)
                .where(IncidentEvidence.verification_result.is_not(None))
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_hallucination_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for r in rows if r == "failed")
        rate = failed / total

        if rate > HALLUCINATION_RATE_THRESHOLD:
            await self._alert(
                "llm_hallucination",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_checked": total,
                        "hallucination_rate": round(rate, 3),
                        "threshold": HALLUCINATION_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "檢核 AI 建議來源與 evidence snapshot 一致性",
                            "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
                        ],
                        "next_action": "run_knowledge_gap_audit",
                        "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
                    },
                    "actionable": {
                        "items": [
                            "啟動 `playbook_evidence` 對齊補償流程",
                            "調整 verify timeout 與降級策略，避免過度信任低品質證據",
                        ],
                    },
                    "failed_count": failed,
                    "total_checked": total,
                    "hallucination_rate": round(rate, 3),
                    "threshold": HALLUCINATION_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_hallucination_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # 4. 執行失敗率 (Blast Radius)
    # =========================================================================

    async def check_execution_blast_radius(self) -> dict[str, Any]:
        """最近 100 筆 AutoRepairExecution.success=False 比例 > 15% → 告警

        2026-04-26 P2.2 by Claude
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution.success)
                .order_by(AutoRepairExecution.created_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_blast_radius_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for r in rows if not r)
        rate = failed / total

        if rate > EXECUTION_FAIL_RATE_THRESHOLD:
            await self._alert(
                "execution_blast_radius",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_executions": total,
                        "failure_rate": round(rate, 3),
                        "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "鎖定失敗 playbook 清單，關閉高風險自動執行",
                            "比對 incident evidence 與 post_execution_verification 失敗原因",
                        ],
                        "next_action": "pause_auto_repair_for_top_failing_playbooks",
                        "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
                    },
                    "actionable": {
                        "items": [
                            "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
                            "必要時啟用 emergency fallback 路由進人工審核",
                        ],
                    },
                    "failed_count": failed,
                    "total_executions": total,
                    "failure_rate": round(rate, 3),
                    "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_blast_radius_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # 5. SLO 合規性（ADR-100）
    # =========================================================================

    async def check_slo_compliance(self) -> dict[str, Any]:
        """SLO 4 項合規性檢查 — 違反時降級飛輪行為

        從 Prometheus Recording rules 讀取 SLI 值，
        與硬紅線閾值比對，違反時呼叫 _alert() 寫 PG + 推 Telegram。

        SLO 1 自主化率:     sli:autonomy_rate:5m      硬紅線 < 0.70
        SLO 2 決策準確率:   sli:decision_accuracy:5m  硬紅線 < 0.85
        SLO 3 信心校準:     sli:confidence_calibration:1h 硬紅線 < 0.70
        SLO 4 KM 增長率:    knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5

        2026-04-27 P3.4 by Claude — AI SLO（ADR-100）
        """
        import httpx
        import math

        from src.core.config import settings

        prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")

        queries: dict[str, str] = {
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
            "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
        }
        # 硬紅線：低於此值必須告警（非軟性警告）
        hard_red_lines: dict[str, float] = {
            "autonomy_rate": 0.70,
            "decision_accuracy": 0.85,
            "confidence_calibration": 0.70,
            "km_growth_rate": 5.0,
        }
        # SLO 目標值（供日誌記錄）
        slo_targets: dict[str, float] = {
            "autonomy_rate": 0.80,
            "decision_accuracy": 0.90,
            "confidence_calibration": 0.80,
            "km_growth_rate": 20.0,
        }

        results: dict[str, Any] = {}

        async with httpx.AsyncClient(timeout=5.0) as client:
            for name, query in queries.items():
                try:
                    resp = await client.get(
                        f"{prom_url}/api/v1/query",
                        params={"query": query},
                    )
                    data = resp.json()
                    if data.get("status") == "success":
                        result_list = data.get("data", {}).get("result", [])
                        # 2026-04-28 ogt + Claude Opus 4.7: P0-1 假警報止血
                        # 空 result = Prometheus 查無資料（metric 未 emit / rule 未部署），不等於 SLO=0
                        # ADR-100 emitter 全部尚未實作（automation_operation_log_total 等 4 個 counter 零定義）
                        # 不可 fallback 0.0，否則必觸發 violated=True 噴假警報
                        if not result_list:
                            results[name] = {
                                "name": name,
                                "status": "skipped",
                                "error": "no_data",
                                "reason": "prometheus_empty_result_metric_not_emitted",
                                "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入，或 multiprocess 目錄未掛載",
                            }
                            logger.warning(
                                "governance_slo_no_data",
                                slo=name,
                                query=query,
                                hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                            )
                            continue
                        value = float(result_list[0]["value"][1])
                        if not math.isfinite(value):
                            results[name] = {
                                "name": name,
                                "status": "skipped",
                                "error": "non_finite_value",
                                "reason": "prometheus_nan_or_inf",
                                "hint": "SLO 分母目前沒有足夠事件，等待下一個有效樣本再評估",
                            }
                            logger.warning(
                                "governance_slo_non_finite",
                                slo=name,
                                query=query,
                                value=str(result_list[0]["value"][1]),
                            )
                            continue
                        threshold = hard_red_lines[name]
                        target = slo_targets[name]
                        violated = value < threshold

                        results[name] = {
                            "name": name,
                            "status": "violated" if violated else "ok",
                            "value": round(value, 4),
                            "slo_target": target,
                            "hard_red_line": threshold,
                            "gap": round(threshold - value, 4) if violated else round(value - target, 4),
                            "violated": violated,
                        }

                        if violated:
                            await self._alert(
                                f"slo_{name}_violation",
                                {
                                    "status": "violation",
                                    "impact": {
                                        "name": name,
                                        "value": round(value, 4),
                                        "target": target,
                                        "threshold": threshold,
                                        "gap": round(threshold - value, 4),
                                    },
                                    "remediation": {
                                        "items": [
                                            "Pause auto-scaling or risky auto-fix tasks",
                                            "Review evidence/decision traces and adjust policy thresholds",
                                        ],
                                        "next_action": "trigger_flywheel_safeguard",
                                    },
                                    "actionable": {
                                        "items": [
                                            "Check verifier lag and post-exec learning health",
                                            "Run emergency incident audit on failed approvals",
                                        ],
                                    },
                                },
                            )
                            logger.warning(
                                "governance_slo_violated",
                                slo=name,
                                value=round(value, 4),
                                hard_red_line=threshold,
                            )
                        elif value == 0 and threshold <= 0:
                            logger.warning(
                                "governance_slo_unexpected_zero",
                                slo=name,
                                value=round(value, 4),
                            )
                        else:
                            logger.info(
                                "governance_slo_ok",
                                slo=name,
                                value=round(value, 4),
                                target=target,
                            )
                    else:
                        results[name] = {
                            "name": name,
                            "status": "error",
                            "error": "prometheus_query_failed",
                            "response_status": data.get("status"),
                        }
                        logger.warning(
                            "governance_slo_prometheus_error",
                            slo=name,
                            query=query,
                            response_status=data.get("status"),
                        )
                except Exception as e:
                    results[name] = {
                        "name": name,
                        "status": "error",
                        "error": str(e),
                    }
                    logger.warning("governance_slo_check_error", slo=name, error=str(e))

        # 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
        # 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康)
        # 防止 dashboard 把 no_data 當 pass 顯示
        violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated")
        skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped")
        ok_count = sum(
            1 for v in results.values()
            if isinstance(v, dict)
            and v.get("status") == "ok"
        )
        error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error")
        results["_meta"] = {
            "violated_count": violated_count,
            "skipped_count": skipped_count,
            "ok_count": ok_count,
            "error_count": error_count,
            "all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}),
            "all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
            "status": (
                "no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0)
                else "violated" if violated_count > 0
                else "ok"
            ),
        }
        logger.info(
            "governance_slo_compliance_complete",
            results=results,
            violated=violated_count,
            skipped=skipped_count,
            ok=ok_count,
            status=results["_meta"]["status"],
        )
        return results

    # =========================================================================
    # 全跑（exception 隔離）
    # =========================================================================

    async def run_self_check(self) -> dict[str, Any]:
        """5 項全跑，每項獨立 try/except 隔離，任一失敗不影響其他項目

        2026-04-26 P2.2 by Claude
        2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance（ADR-100）
        """
        results: dict[str, Any] = {}
        checks = [
            ("trust_drift", self.check_trust_drift),
            ("knowledge_degradation", self.check_knowledge_degradation),
            ("llm_hallucination", self.check_llm_hallucination),
            ("execution_blast_radius", self.check_execution_blast_radius),
            ("slo_compliance", self.check_slo_compliance),
        ]

        for check_name, check_func in checks:
            try:
                results[check_name] = await check_func()
            except Exception as e:
                logger.exception(
                    "governance_check_failed",
                    check=check_name,
                    error=str(e),
                )
                results[check_name] = {"error": str(e)}

        # 2026-04-27 Wave8-X3 by Claude — B8 全失敗聚合告警
        # ≥3 項失敗代表治理機制本身故障，必須送出緊急告警
        failed_checks = [k for k, v in results.items() if isinstance(v, dict) and "error" in v]
        if len(failed_checks) >= 3:
            try:
                await self._alert(
                    "governance_self_failure",
                    {
                        "status": "critical",
                        "impact": {
                            "failed_checks": failed_checks,
                            "total_checks": 5,  # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
                            "errors": {k: results[k].get("error") for k in failed_checks},
                        },
                        "remediation": {
                            "items": [
                                "暫停非關鍵治理自動化接收鏈路",
                                "聚焦治理執行路徑錯誤並補齊 fallback",
                            ],
                            "next_action": "investigate_governance_pipeline_health",
                        },
                        "actionable": {
                            "items": [
                                "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
                                "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_self_failure_alert_failed")

        # 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
        # SLO 全 skipped 是「資料未產生」（emitter 未實作）不是「治理機制故障」
        # 用獨立 alert 區分，避免污染 self_failure 計數
        slo_meta = (
            results.get("slo_compliance", {}).get("_meta")
            if isinstance(results.get("slo_compliance"), dict)
            else None
        )
        if slo_meta and slo_meta.get("all_skipped"):
            try:
                await self._alert(
                    "governance_slo_data_gap",
                    {
                        "status": "warning",
                        "impact": {
                            "reason": "all_slo_metrics_not_emitted",
                            "skipped_count": slo_meta.get("skipped_count", 0),
                            "all_slo_metrics_not_emitted": True,
                        },
                        "remediation": {
                            "items": [
                                "補齊 ADR-100 SLO emitter（automation_operation_log_total / post_execution_verification_total / knowledge_entries_total）",
                                "確認 Prometheus recording rules 已載入，且 API Pod multiprocess 目錄可寫",
                            ],
                            "next_action": "run_adr100_slo_emit_playbook",
                            "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
                        },
                        "actionable": {
                            "items": [
                                "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                                "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_slo_data_gap_alert_failed")

        logger.info("governance_self_check_complete", results=results)
        return results

    # =========================================================================
    # 告警輸出
    # =========================================================================

    async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
        """Record a governance event and push it to the alerter.

        Three steps, in order:
        1. Insert a row into ``AiGovernanceEvent`` (ADR-085: AI learning
           results must be persisted in PG, never only in a cache). A failure
           here is logged and does not stop the remaining steps.
        2. Emit a ``governance_alert`` structlog warning with the payload
           fields spread as log keys.
        3. Call ``alert_governance`` on the injected alerter, or on the one
           returned by ``get_failover_alerter()`` when none was injected.
           Failures to obtain or call the alerter are logged, not raised.

        Args:
            event_type: Identifier stored with the event and sent to the alerter.
            payload: Event details; stored as ``details`` and forwarded as-is.

        2026-04-26 P2.2 by Claude
        2026-04-26 P2-DB-Fix by Claude — db-expert P0.1: added the PG write
        """
        # Step 1: persist to PG (best effort).
        try:
            from sqlalchemy import insert as _sa_insert

            async with get_db_context() as session:
                insert_stmt = _sa_insert(AiGovernanceEvent).values(
                    event_type=event_type,
                    details=payload,
                )
                await session.execute(insert_stmt)
                await session.commit()
        except Exception as pg_exc:
            logger.warning("governance_pg_write_failed", error=str(pg_exc))

        # Step 2: structured log (pre-existing behaviour).
        # NOTE(review): the payload is spread into keyword arguments, so a
        # payload key named "event" or "event_type" would raise TypeError here.
        logger.warning("governance_alert", event_type=event_type, **payload)

        # Step 3: resolve the alerter lazily so importing this module does not
        # create a circular dependency at startup.
        notifier = self._alerter
        if notifier is None:
            try:
                from src.services.failover_alerter import get_failover_alerter

                notifier = get_failover_alerter()
            except Exception as exc:
                logger.warning("governance_alerter_get_failed", error=str(exc))
                return

        try:
            await notifier.alert_governance(event_type, payload)
        except Exception as exc:
            logger.warning("governance_telegram_alert_failed", error=str(exc))

# =============================================================================
# Singleton + 排程迴圈
# =============================================================================

# Module-level slot for the shared GovernanceAgent; filled lazily by
# get_governance_agent() and cleared by reset_governance_agent().
_agent: GovernanceAgent | None = None


def get_governance_agent() -> GovernanceAgent:
    """Return the shared GovernanceAgent, creating it on first use.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    if _agent is not None:
        return _agent
    # First call: build the instance with the default (lazily resolved) alerter.
    _agent = GovernanceAgent()
    return _agent


def reset_governance_agent() -> None:
    """Drop the shared GovernanceAgent so the next getter call builds a new one.

    Intended for tests.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    _agent = None


async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run ``GovernanceAgent.run_self_check()`` forever, pausing between runs.

    Follows the ``asyncio.create_task`` + sleep loop pattern used in main.py
    (no APScheduler). The sleep starts only after a run has finished, so runs
    never pile up; an exception from a run is logged and the loop continues.

    Args:
        interval_seconds: Pause between the end of one run and the start of
            the next (default: one hour).

    2026-04-26 P2.2 by Claude
    """
    governance = get_governance_agent()
    while True:
        try:
            await governance.run_self_check()
        except Exception as exc:
            # Keep the loop alive; the next iteration retries after the pause.
            logger.warning("governance_loop_error", error=str(exc))
        await asyncio.sleep(interval_seconds)