Files
awoooi/apps/api/src/services/governance_agent.py
Your Name 21dcfbd991
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 22s
CD Pipeline / tests (push) Successful in 1m6s
CD Pipeline / build-and-deploy (push) Successful in 5m17s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
fix(governance): collapse km slo fallback series
2026-05-14 19:37:15 +08:00

780 lines
34 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""AI 自我治理 Agent
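Five self-checks, executed once per hour:
1. trust_drift — playbook trust_score < 0.2 → alert and recommend deprecation
2. knowledge_degradation — KM entries not updated for 7 days > 20% of the total → alert on knowledge decay
3. llm_hallucination — share of verification_result=failed in the latest 100 evidence rows > 10%
4. execution_blast_radius — share of auto_repair_executions.success=False in the latest 100 rows > 15%
5. slo_compliance — compliance check of the 4 SLOs (ADR-100); flywheel behaviour is degraded on violation
All checks are isolated from each other (try/except); a failure in one does not block the others.
2026-04-26 P2.2 by Claude
2026-04-27 P3.4 by Claude — added the SLO compliance self-check (ADR-100)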
"""
from __future__ import annotations
import asyncio
from datetime import timedelta
from typing import Any
import structlog
from sqlalchemy import func, select
from src.db.base import get_db_context
from src.db.models import (
AiGovernanceEvent,
AutoRepairExecution,
IncidentEvidence,
KnowledgeEntryRecord,
PlaybookRecord,
)
from src.models.knowledge import EntryStatus
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# =============================================================================
# Threshold constants
# =============================================================================
TRUST_DRIFT_THRESHOLD = 0.2 # alert when a playbook's trust_score is below this value
# 2026-05-02 ogt + Claude Sonnet 4.6: trust_drift auto-deprecate
# trust < 0.2 + (last used more than N days ago OR never used and created more than N days ago) → auto-deprecate
# N is 30 days: gives a playbook an ample trial period so a new proposal is not
# deprecated because of a few early failures
TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS = 30
KM_STALE_DAYS = 7 # a knowledge entry not updated for more than this many days counts as stale
KM_STALE_RATIO = 0.20 # alert when the stale share exceeds this value
HALLUCINATION_RATE_THRESHOLD = 0.10 # alert when the share of failed LLM verifications exceeds this value
EXECUTION_FAIL_RATE_THRESHOLD = 0.15 # alert when the share of failed executions exceeds this value
RECENT_LIMIT = 100 # number of most recent rows the rate checks look at
# =============================================================================
# GovernanceAgent
# =============================================================================
class GovernanceAgent:
"""AI 自我治理 Agent — 5 項自檢 + 1h 排程
1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
5: slo_compliance (ADR-100 SLO compliance check)
2026-04-26 P2.2 by Claude
2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance
"""
def __init__(self, alerter=None) -> None:
# alerter: FailoverAlerter instance可注入預設從 singleton 取得)
self._alerter = alerter
# =========================================================================
# 1. Playbook 信任度漂移
# =========================================================================
async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
    """Detect low-trust playbooks and auto-deprecate the stale ones.

    Loads every playbook whose status is neither ``deprecated`` nor
    ``archived`` and treats those with ``trust_score`` below
    ``TRUST_DRIFT_THRESHOLD`` as drifted. A drifted playbook is switched to
    status ``deprecated`` when its reference time (``last_used_at``, or
    ``created_at`` when it was never used) is older than
    ``TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS`` days; the remaining drifted
    playbooks are only reported.

    Args:
        emit_alert: When False no ``trust_drift`` alert is raised; the
            deprecation and the statistics still happen. Added so the W-6
            watchdog can query the numbers without a second Telegram
            message on top of the hourly self-check.

    Returns:
        Dict with ``checked``, ``drifted``, ``drift_ratio`` (not rounded),
        ``auto_deprecated`` and ``kept``.

    2026-04-26 P2.2 by Claude
    2026-05-02 ogt + Claude Sonnet 4.6: added the auto-deprecate path
    2026-05-05 Codex: added emit_alert
    """
    async with get_db_context() as db:
        active_query = select(PlaybookRecord).where(
            PlaybookRecord.status.not_in(["deprecated", "archived"])
        )
        playbooks = (await db.execute(active_query)).scalars().all()
        total = len(playbooks)
        low_trust = [
            pb for pb in playbooks if float(pb.trust_score) < TRUST_DRIFT_THRESHOLD
        ]

        # Auto-deprecate eligibility: anything whose reference time is older
        # than this instant is considered stale.
        stale_before = now_taipei() - timedelta(
            days=TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS
        )
        auto_deprecated_ids: list[str] = []
        kept_ids: list[str] = []
        for pb in low_trust:
            used_at, created_at = pb.last_used_at, pb.created_at
            # Never used → created_at stands in as "time it entered the system".
            anchor = created_at if used_at is None else used_at
            is_stale = anchor is not None and anchor < stale_before
            if is_stale:
                pb.status = "deprecated"
            (auto_deprecated_ids if is_stale else kept_ids).append(pb.playbook_id)

        # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 1 fix (P0 silent failure):
        # the commit used to run after the ``with`` block, when the session
        # had already been closed by the context manager, and the resulting
        # InvalidRequestError was swallowed by an outer try/except. The commit
        # therefore has to happen here, while the session is still open.
        if auto_deprecated_ids:
            await db.commit()
            logger.info(
                "governance_trust_drift_auto_deprecated",
                count=len(auto_deprecated_ids),
                ids=auto_deprecated_ids[:10],
            )

    drifted_count = len(low_trust)
    deprecated_count = len(auto_deprecated_ids)
    deprecated_sample = auto_deprecated_ids[:10]
    kept_sample = kept_ids[:10]
    drift_ratio = drifted_count / total if total > 0 else 0.0

    if low_trust and emit_alert:
        impact = {
            "drifted_count": drifted_count,
            "total_playbooks": total,
            "drift_ratio": round(drift_ratio, 3),
            "threshold": TRUST_DRIFT_THRESHOLD,
            "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
        }
        remediation = {
            "items": [
                "Auto-deprecate low-trust stale playbooks",
                "Review candidate playbooks by impact scope and rollback if needed",
            ],
            "auto_deprecated_count": deprecated_count,
            "auto_deprecated_ids": deprecated_sample,
        }
        actionable = {
            "items": [
                "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
                "必要時人工覆核 kept_ids 中的高風險 Playbook",
            ],
            "sample_playbook_ids": kept_sample,
        }
        await self._alert(
            "trust_drift",
            {
                "status": "warning",
                "impact": impact,
                "remediation": remediation,
                "actionable": actionable,
                # Flat copies of the headline numbers, next to the grouped ones.
                "drifted_count": drifted_count,
                "auto_deprecated_count": deprecated_count,
                "auto_deprecated_ids": auto_deprecated_ids[:10],
                "playbook_ids": kept_ids[:10],
                "total_playbooks": total,
                "threshold": TRUST_DRIFT_THRESHOLD,
                "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
            },
        )

    logger.info(
        "governance_trust_drift_checked",
        total=total,
        drifted=drifted_count,
        auto_deprecated=deprecated_count,
        kept=len(kept_ids),
    )
    return {
        "checked": total,
        "drifted": drifted_count,
        "drift_ratio": drift_ratio,
        "auto_deprecated": deprecated_count,
        "kept": len(kept_ids),
    }
# =========================================================================
# 2. 知識庫衰退
# =========================================================================
async def check_knowledge_degradation(self) -> dict[str, Any]:
    """Alert when too large a share of knowledge entries has gone stale.

    Counts the non-archived ``KnowledgeEntryRecord`` rows and, among them,
    those whose ``updated_at`` is older than ``KM_STALE_DAYS`` days. When
    the stale share is above ``KM_STALE_RATIO`` a ``knowledge_degradation``
    alert is raised.

    Returns:
        Dict with ``total``, ``stale`` and ``ratio`` (rounded to 3 places).

    2026-04-26 P2.2 by Claude
    """
    stale_before = now_taipei() - timedelta(days=KM_STALE_DAYS)
    not_archived = KnowledgeEntryRecord.status != EntryStatus.ARCHIVED

    async with get_db_context() as db:
        # All entries that are not archived.
        total_query = (
            select(func.count()).select_from(KnowledgeEntryRecord).where(not_archived)
        )
        total = (await db.execute(total_query)).scalar() or 0

        # Non-archived entries whose last update is older than the cutoff.
        stale_query = (
            select(func.count())
            .select_from(KnowledgeEntryRecord)
            .where(not_archived, KnowledgeEntryRecord.updated_at < stale_before)
        )
        stale = (await db.execute(stale_query)).scalar() or 0

    ratio = stale / total if total > 0 else 0.0
    rounded_ratio = round(ratio, 3)

    if total > 0 and ratio > KM_STALE_RATIO:
        stats = {
            "stale_count": stale,
            "total_count": total,
            "stale_ratio": rounded_ratio,
            "threshold": KM_STALE_RATIO,
            "stale_days": KM_STALE_DAYS,
        }
        await self._alert(
            "knowledge_degradation",
            {
                "status": "warning",
                "impact": dict(stats),
                "remediation": {
                    "items": [
                        "啟動 KM 反查與自動補齊流程",
                        "關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
                    ],
                    "next_action": "run_kb_growth_healthcheck",
                },
                "actionable": {
                    "items": [
                        "每日檢查 ANTI_PATTERN 更新結果",
                        "安排至少 2 位 owner 對 stale條目做快速人工審核",
                    ],
                },
                # The same numbers again as flat top-level keys.
                **stats,
            },
        )

    logger.info(
        "governance_knowledge_degradation_checked",
        total=total,
        stale=stale,
        ratio=rounded_ratio,
    )
    return {"total": total, "stale": stale, "ratio": rounded_ratio}
# =========================================================================
# 3. LLM 幻覺率
# =========================================================================
async def check_llm_hallucination(self) -> dict[str, Any]:
    """Alert when too many recent evidence verifications failed.

    Takes the ``RECENT_LIMIT`` most recently collected ``IncidentEvidence``
    rows that have a ``verification_result`` and computes the share equal to
    ``"failed"``. Per the original author's note the possible values are
    success / degraded / failed / timeout, and only ``failed`` is treated as
    a hallucination (a wrong LLM judgement that made post-execution
    verification fail). A share above ``HALLUCINATION_RATE_THRESHOLD``
    raises an ``llm_hallucination`` alert.

    Returns:
        Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places);
        all zero when there are no rows to look at.

    2026-04-26 P2.2 by Claude
    """
    recent_query = (
        select(IncidentEvidence.verification_result)
        .where(IncidentEvidence.verification_result.is_not(None))
        .order_by(IncidentEvidence.collected_at.desc())
        .limit(RECENT_LIMIT)
    )
    async with get_db_context() as db:
        outcomes = (await db.execute(recent_query)).scalars().all()

    total = len(outcomes)
    if total == 0:
        # Nothing verified yet: report zeros and skip the rate computation.
        logger.info("governance_hallucination_checked", total=0, rate=0.0)
        return {"total": 0, "failed": 0, "rate": 0.0}

    failed = sum(1 for outcome in outcomes if outcome == "failed")
    rate = failed / total
    rounded_rate = round(rate, 3)

    if rate > HALLUCINATION_RATE_THRESHOLD:
        stats = {
            "failed_count": failed,
            "total_checked": total,
            "hallucination_rate": rounded_rate,
            "threshold": HALLUCINATION_RATE_THRESHOLD,
        }
        await self._alert(
            "llm_hallucination",
            {
                "status": "warning",
                "impact": dict(stats),
                "remediation": {
                    "items": [
                        "檢核 AI 建議來源與 evidence snapshot 一致性",
                        "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
                    ],
                    "next_action": "run_knowledge_gap_audit",
                    "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
                },
                "actionable": {
                    "items": [
                        "啟動 `playbook_evidence` 對齊補償流程",
                        "調整 verify timeout 與降級策略,避免過度信任低品質證據",
                    ],
                },
                # The same numbers again as flat top-level keys.
                **stats,
            },
        )

    logger.info(
        "governance_hallucination_checked",
        total=total,
        failed=failed,
        rate=rounded_rate,
    )
    return {"total": total, "failed": failed, "rate": rounded_rate}
# =========================================================================
# 4. 執行失敗率 (Blast Radius)
# =========================================================================
async def check_execution_blast_radius(self) -> dict[str, Any]:
    """Alert when too many recent auto-repair executions failed.

    Takes the ``success`` flag of the ``RECENT_LIMIT`` most recently created
    ``AutoRepairExecution`` rows and computes the share that is falsy. A
    share above ``EXECUTION_FAIL_RATE_THRESHOLD`` raises an
    ``execution_blast_radius`` alert.

    Returns:
        Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places);
        all zero when there are no executions.

    2026-04-26 P2.2 by Claude
    """
    recent_query = (
        select(AutoRepairExecution.success)
        .order_by(AutoRepairExecution.created_at.desc())
        .limit(RECENT_LIMIT)
    )
    async with get_db_context() as db:
        success_flags = (await db.execute(recent_query)).scalars().all()

    total = len(success_flags)
    if total == 0:
        logger.info("governance_blast_radius_checked", total=0, rate=0.0)
        return {"total": 0, "failed": 0, "rate": 0.0}

    # NOTE(review): any falsy flag counts as a failure, so a NULL ``success``
    # (if the column allows it, e.g. for unfinished runs) is counted too —
    # confirm that this is intended.
    failed = len([flag for flag in success_flags if not flag])
    rate = failed / total
    rounded_rate = round(rate, 3)

    if rate > EXECUTION_FAIL_RATE_THRESHOLD:
        stats = {
            "failed_count": failed,
            "total_executions": total,
            "failure_rate": rounded_rate,
            "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
        }
        await self._alert(
            "execution_blast_radius",
            {
                "status": "warning",
                "impact": dict(stats),
                "remediation": {
                    "items": [
                        "鎖定失敗 playbook 清單,關閉高風險自動執行",
                        "比對 incident evidence 與 post_execution_verification 失敗原因",
                    ],
                    "next_action": "pause_auto_repair_for_top_failing_playbooks",
                    "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
                },
                "actionable": {
                    "items": [
                        "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
                        "必要時啟用 emergency fallback 路由進人工審核",
                    ],
                },
                # The same numbers again as flat top-level keys.
                **stats,
            },
        )

    logger.info(
        "governance_blast_radius_checked",
        total=total,
        failed=failed,
        rate=rounded_rate,
    )
    return {"total": total, "failed": failed, "rate": rounded_rate}
# =========================================================================
# 5. SLO 合規性ADR-100
# =========================================================================
async def check_slo_compliance(self) -> dict[str, Any]:
    """Check the four ADR-100 SLOs against their hard red lines.

    Runs one Prometheus instant query per SLO and compares the first sample
    of the answer with that SLO's hard red line. A value below the red line
    raises an ``slo_<name>_violation`` alert through ``_alert`` (PG row +
    Telegram push).

    SLO 1 autonomy rate:          sli:autonomy_rate:5m            red line < 0.70
    SLO 2 decision accuracy:      sli:decision_accuracy:5m        red line < 0.85
    SLO 3 confidence calibration: sli:confidence_calibration:1h   red line < 0.70
    SLO 4 KM growth:              knowledge_entries_created_24h / sli:km_growth_rate:24h   red line < 5

    Returns:
        Dict keyed by SLO name. Each entry carries a ``status`` of ``ok``,
        ``violated``, ``skipped`` (empty or non-finite sample) or ``error``
        (query failed). The extra ``_meta`` entry aggregates the counts and
        an overall ``status``: ``violated`` if any SLO is violated,
        ``no_data`` if nothing was ok/violated and at least one SLO was
        skipped, ``error`` if nothing was ok/violated/skipped and at least
        one query failed, ``ok`` otherwise.

    2026-04-27 P3.4 by Claude — AI SLO (ADR-100)
    """
    import math

    import httpx

    from src.core.config import settings

    prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")
    queries: dict[str, str] = {
        "autonomy_rate": "sli:autonomy_rate:5m",
        "decision_accuracy": "sli:decision_accuracy:5m",
        "confidence_calibration": "sli:confidence_calibration:1h",
        "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
    }
    # Hard red lines: a value below these must alert (not a soft warning).
    hard_red_lines: dict[str, float] = {
        "autonomy_rate": 0.70,
        "decision_accuracy": 0.85,
        "confidence_calibration": 0.70,
        "km_growth_rate": 5.0,
    }
    # SLO targets (reported in results and logs only).
    slo_targets: dict[str, float] = {
        "autonomy_rate": 0.80,
        "decision_accuracy": 0.90,
        "confidence_calibration": 0.80,
        "km_growth_rate": 20.0,
    }
    results: dict[str, Any] = {}

    async with httpx.AsyncClient(timeout=5.0) as client:
        for name, query in queries.items():
            # One try per SLO so a failing query only marks that SLO as error.
            try:
                resp = await client.get(
                    f"{prom_url}/api/v1/query",
                    params={"query": query},
                )
                data = resp.json()

                if data.get("status") != "success":
                    results[name] = {
                        "name": name,
                        "status": "error",
                        "error": "prometheus_query_failed",
                        "response_status": data.get("status"),
                    }
                    logger.warning(
                        "governance_slo_prometheus_error",
                        slo=name,
                        query=query,
                        response_status=data.get("status"),
                    )
                    continue

                result_list = data.get("data", {}).get("result", [])
                # 2026-04-28 ogt + Claude Opus 4.7: P0-1 false-alarm fix.
                # An empty result means Prometheus has no data (metric not
                # emitted / rule not deployed); it does NOT mean SLO = 0.
                # Falling back to 0.0 would always yield violated=True and
                # fire a false alert, so the SLO is skipped instead.
                if not result_list:
                    results[name] = {
                        "name": name,
                        "status": "skipped",
                        "error": "no_data",
                        "reason": "prometheus_empty_result_metric_not_emitted",
                        "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
                    }
                    logger.warning(
                        "governance_slo_no_data",
                        slo=name,
                        query=query,
                        hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                    )
                    continue

                value = float(result_list[0]["value"][1])
                # NaN / ±Inf cannot be compared with a red line; skip the SLO.
                if not math.isfinite(value):
                    results[name] = {
                        "name": name,
                        "status": "skipped",
                        "error": "non_finite_value",
                        "reason": "prometheus_nan_or_inf",
                        "hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
                    }
                    logger.warning(
                        "governance_slo_non_finite",
                        slo=name,
                        query=query,
                        value=str(result_list[0]["value"][1]),
                    )
                    continue

                threshold = hard_red_lines[name]
                target = slo_targets[name]
                violated = value < threshold
                results[name] = {
                    "name": name,
                    "status": "violated" if violated else "ok",
                    "value": round(value, 4),
                    "slo_target": target,
                    "hard_red_line": threshold,
                    # Violated: distance below the red line. Otherwise:
                    # distance from the target (negative when below target).
                    "gap": round(threshold - value, 4) if violated else round(value - target, 4),
                    "violated": violated,
                }

                if violated:
                    await self._alert(
                        f"slo_{name}_violation",
                        {
                            "status": "violation",
                            "impact": {
                                "name": name,
                                "value": round(value, 4),
                                "target": target,
                                "threshold": threshold,
                                "gap": round(threshold - value, 4),
                            },
                            "remediation": {
                                "items": [
                                    "Pause auto-scaling or risky auto-fix tasks",
                                    "Review evidence/decision traces and adjust policy thresholds",
                                ],
                                "next_action": "trigger_flywheel_safeguard",
                            },
                            "actionable": {
                                "items": [
                                    "Check verifier lag and post-exec learning health",
                                    "Run emergency incident audit on failed approvals",
                                ],
                            },
                        },
                    )
                    logger.warning(
                        "governance_slo_violated",
                        slo=name,
                        value=round(value, 4),
                        hard_red_line=threshold,
                    )
                elif value == 0 and threshold <= 0:
                    # Only reachable if a red line is ever set to <= 0.
                    logger.warning(
                        "governance_slo_unexpected_zero",
                        slo=name,
                        value=round(value, 4),
                    )
                else:
                    logger.info(
                        "governance_slo_ok",
                        slo=name,
                        value=round(value, 4),
                        target=target,
                    )
            except Exception as e:
                results[name] = {
                    "name": name,
                    "status": "error",
                    "error": str(e),
                }
                logger.warning("governance_slo_check_error", slo=name, error=str(e))

    # 2026-04-29 ogt + Claude Opus 4.7: critic M6 fix.
    # The aggregated _meta separates "everything skipped" (metric not
    # emitted) from "everything ok" (SLO healthy), so a dashboard does not
    # show no_data as a pass.
    statuses = [
        entry.get("status") for entry in results.values() if isinstance(entry, dict)
    ]
    violated_count = statuses.count("violated")
    skipped_count = statuses.count("skipped")
    ok_count = statuses.count("ok")
    error_count = statuses.count("error")

    no_usable_sample = ok_count == 0 and violated_count == 0
    all_skipped = skipped_count > 0 and no_usable_sample
    if violated_count > 0:
        overall_status = "violated"
    elif all_skipped:
        overall_status = "no_data"
    elif no_usable_sample and error_count > 0:
        # Fix: when every query failed (e.g. Prometheus unreachable) there is
        # no evidence of health, so this must not be reported as "ok".
        overall_status = "error"
    else:
        overall_status = "ok"

    results["_meta"] = {
        "violated_count": violated_count,
        "skipped_count": skipped_count,
        "ok_count": ok_count,
        "error_count": error_count,
        "all_status": sorted(set(statuses)),
        "all_skipped": all_skipped,
        "status": overall_status,
    }
    logger.info(
        "governance_slo_compliance_complete",
        results=results,
        violated=violated_count,
        skipped=skipped_count,
        ok=ok_count,
        status=results["_meta"]["status"],
    )
    return results
# =========================================================================
# 全跑exception 隔離)
# =========================================================================
async def run_self_check(self) -> dict[str, Any]:
    """Run all five checks, isolating each one from the others.

    Every check runs inside its own try/except; a check that raises is
    logged and recorded as ``{"error": <message>}`` while the remaining
    checks still run. Two aggregate alerts may follow:

    * ``governance_self_failure`` when three or more checks failed;
    * ``governance_slo_data_gap`` when the SLO check reports ``all_skipped``.

    Returns:
        Dict mapping each check name to that check's result (or error dict).

    2026-04-26 P2.2 by Claude
    2026-04-27 P3.4 by Claude — added check 5, slo_compliance (ADR-100)
    """
    checks = [
        ("trust_drift", self.check_trust_drift),
        ("knowledge_degradation", self.check_knowledge_degradation),
        ("llm_hallucination", self.check_llm_hallucination),
        ("execution_blast_radius", self.check_execution_blast_radius),
        ("slo_compliance", self.check_slo_compliance),
    ]
    results: dict[str, Any] = {}
    for check_name, run_check in checks:
        try:
            outcome = await run_check()
        except Exception as exc:
            logger.exception(
                "governance_check_failed",
                check=check_name,
                error=str(exc),
            )
            outcome = {"error": str(exc)}
        results[check_name] = outcome

    # 2026-04-27 Wave8-X3 by Claude — B8 aggregated failure alert.
    # Three or more failed checks mean the governance mechanism itself is
    # broken, so an urgent alert has to go out.
    failed_checks = [
        check_name
        for check_name, outcome in results.items()
        if isinstance(outcome, dict) and "error" in outcome
    ]
    if len(failed_checks) >= 3:
        failure_payload = {
            "status": "critical",
            "impact": {
                "failed_checks": failed_checks,
                # Five checks since slo_compliance was added (2026-04-27 P3.4).
                "total_checks": len(checks),
                "errors": {
                    check_name: results[check_name].get("error")
                    for check_name in failed_checks
                },
            },
            "remediation": {
                "items": [
                    "暫停非關鍵治理自動化接收鏈路",
                    "聚焦治理執行路徑錯誤並補齊 fallback",
                ],
                "next_action": "investigate_governance_pipeline_health",
            },
            "actionable": {
                "items": [
                    "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
                    "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
                ],
            },
        }
        try:
            await self._alert("governance_self_failure", failure_payload)
        except Exception:
            logger.exception("governance_self_failure_alert_failed")

    # 2026-04-29 ogt + Claude Opus 4.7: critic M6 fix.
    # "All SLOs skipped" means the data has not been produced (emitter not
    # implemented); it is not a governance failure. A separate alert keeps
    # it out of the self_failure count.
    slo_result = results.get("slo_compliance")
    slo_meta = slo_result.get("_meta") if isinstance(slo_result, dict) else None
    if slo_meta and slo_meta.get("all_skipped"):
        data_gap_payload = {
            "status": "warning",
            "impact": {
                "reason": "all_slo_metrics_not_emitted",
                "skipped_count": slo_meta.get("skipped_count", 0),
                "all_slo_metrics_not_emitted": True,
            },
            "remediation": {
                "items": [
                    "補齊 ADR-100 SLO emitterautomation_operation_log_total / post_execution_verification_total / knowledge_entries_total",
                    "確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
                ],
                "next_action": "run_adr100_slo_emit_playbook",
                "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
            },
            "actionable": {
                "items": [
                    "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                    "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
                ],
            },
        }
        try:
            await self._alert("governance_slo_data_gap", data_gap_payload)
        except Exception:
            logger.exception("governance_slo_data_gap_alert_failed")

    logger.info("governance_self_check_complete", results=results)
    return results
# =========================================================================
# 告警輸出
# =========================================================================
async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
    """Record a governance alert: PG row, structlog line, alerter push.

    The three steps run in that order. A failed PG write or a failed push is
    logged as a warning and does not raise, so neither blocks the caller.

    Args:
        event_type: Alert name; stored in ``AiGovernanceEvent.event_type``
            and passed on to the alerter.
        payload: Alert details; stored in ``AiGovernanceEvent.details``,
            expanded into the log line and passed on to the alerter.

    2026-04-26 P2.2 by Claude
    2026-04-26 P2-DB-Fix by Claude — db-expert P0.1: added the PG write to
    ai_governance_events (ADR-085: AI learning results must be persisted in
    PG, never only in a cache)
    """
    # 1. Persist to PG (ADR-085); a failure here must not stop the alert.
    try:
        from sqlalchemy import insert as _sa_insert

        event_insert = _sa_insert(AiGovernanceEvent).values(
            event_type=event_type,
            details=payload,
        )
        async with get_db_context() as db:
            await db.execute(event_insert)
            await db.commit()
    except Exception as pg_error:
        logger.warning("governance_pg_write_failed", error=str(pg_error))

    # 2. Structured log line (pre-existing behaviour).
    # NOTE(review): payload keys become keyword arguments of this call, so a
    # payload key named ``event_type`` would raise TypeError here.
    logger.warning("governance_alert", event_type=event_type, **payload)

    # 3. Push through the alerter. The default alerter is imported lazily, at
    # call time, to avoid a circular import at startup.
    alerter = self._alerter
    if alerter is None:
        try:
            from src.services.failover_alerter import get_failover_alerter

            alerter = get_failover_alerter()
        except Exception as lookup_error:
            logger.warning("governance_alerter_get_failed", error=str(lookup_error))
            return

    try:
        await alerter.alert_governance(event_type, payload)
    except Exception as push_error:
        logger.warning("governance_telegram_alert_failed", error=str(push_error))
# =============================================================================
# Singleton + scheduling loop
# =============================================================================
# Module-level slot for the shared GovernanceAgent; None until one is created.
_agent: GovernanceAgent | None = None
def get_governance_agent() -> GovernanceAgent:
    """Return the shared GovernanceAgent, creating it on first use.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    if _agent is not None:
        return _agent
    _agent = GovernanceAgent()
    return _agent
def reset_governance_agent() -> None:
    """Drop the shared GovernanceAgent so the next lookup builds a new one.

    Intended for tests.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    _agent = None
async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run ``GovernanceAgent.run_self_check()`` forever, once per interval.

    Follows the asyncio.create_task + sleep loop pattern used in main.py (no
    APScheduler). The loop sleeps ``interval_seconds`` after every run, so
    runs never pile up. An exception escaping a run is logged as a warning
    and the loop carries on.

    Args:
        interval_seconds: Pause between two runs, in seconds (default 1 hour).

    2026-04-26 P2.2 by Claude
    """
    agent = get_governance_agent()
    while True:
        try:
            await agent.run_self_check()
        except Exception as exc:
            logger.warning("governance_loop_error", error=str(exc))
        await asyncio.sleep(interval_seconds)