Files
awoooi/apps/api/src/services/governance_agent.py
Your Name d6c904dd0f
All checks were successful
CD Pipeline / tests (push) Successful in 1m19s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 3m31s
CD Pipeline / post-deploy-checks (push) Successful in 1m29s
fix(api): add quality summary slo metric
2026-06-01 17:00:50 +08:00

1063 lines
45 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""AI self-governance agent.

Five self-checks, executed once every hour:
1. trust_drift — playbook trust_score < 0.2 → alert recommending deprecation
2. knowledge_degradation — KM entries not updated for 7 days > 20% of total → knowledge-decay alert
3. llm_hallucination — share of verification_result=failed in the latest 100 evidence rows > 10%
4. execution_blast_radius — share of auto_repair_executions.success=False in the latest 100 rows > 15%
5. slo_compliance — 5 SLO compliance checks (ADR-100); a violation degrades flywheel behaviour

Every check is isolated from the others (try/except): one failing check does not block the rest.

2026-04-26 P2.2 by Claude
2026-04-27 P3.4 by Claude — added the SLO compliance self-check (ADR-100)
"""
from __future__ import annotations
import asyncio
from datetime import timedelta
from typing import Any
import structlog
from sqlalchemy import func, select, update
from src.db.base import get_db_context
from src.db.models import (
AiGovernanceEvent,
AutoRepairExecution,
GovernanceRemediationDispatch,
IncidentEvidence,
KnowledgeEntryRecord,
PlaybookRecord,
generate_uuid,
)
from src.models.knowledge import EntryStatus
from src.repositories.governance_remediation_dispatch_repo import (
DispatchAlreadyActive,
create_dispatch,
)
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# =============================================================================
# Threshold constants
# =============================================================================
TRUST_DRIFT_THRESHOLD = 0.2 # a playbook trust_score below this value raises an alert
# 2026-05-02 ogt + Claude Sonnet 4.6: trust_drift auto-deprecate
# trust < 0.2 + (last_used more than N days ago OR never used and created more than N days ago) → auto-deprecate
# N is 30 days so a playbook gets a fair trial period and a new proposal is not
# retired because of a handful of early failures.
TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS = 30
KM_STALE_DAYS = 7 # a knowledge entry not updated for this many days counts as stale
KM_STALE_RATIO = 0.20 # a stale share above this value raises an alert
HALLUCINATION_RATE_THRESHOLD = 0.10 # a share of failed LLM verifications above this value raises an alert
EXECUTION_FAIL_RATE_THRESHOLD = 0.15 # a share of failed executions above this value raises an alert
RECENT_LIMIT = 100 # how many of the most recent rows feed the statistics
GOVERNANCE_SELF_CHECK_LEASE_KEY = "governance:self_check:cycle_lease"
def _slo_remediation_items(name: str) -> list[str]:
if name == "truth_chain_quality_summary_latency":
return [
"Check truth-chain quality summary cache miss latency and DB query plan",
"Confirm operator summary cache is warm before treating homepage SLO as degraded",
]
return [
"Pause auto-scaling or risky auto-fix tasks",
"Review evidence/decision traces and adjust policy thresholds",
]
def _slo_actionable_items(name: str) -> list[str]:
if name == "truth_chain_quality_summary_latency":
return [
"Call /api/v1/platform/truth-chain/quality/summary?limit=8&refresh=true and compare duration",
"Inspect /metrics for awooop_truth_chain_quality_summary_last_duration_seconds",
]
return [
"Check verifier lag and post-exec learning health",
"Run emergency incident audit on failed approvals",
]
def _slo_next_action(name: str) -> str:
if name == "truth_chain_quality_summary_latency":
return "run_truth_chain_quality_summary_latency_probe"
return "trigger_flywheel_safeguard"
# =============================================================================
# GovernanceAgent
# =============================================================================
class GovernanceAgent:
    """AI self-governance agent — five self-checks driven by an hourly loop.

    1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
    5:   slo_compliance (ADR-100 SLO compliance)

    2026-04-26 P2.2 by Claude
    2026-04-27 P3.4 by Claude — added the fifth check, slo_compliance
    """

    def __init__(self, alerter=None) -> None:
        """Store an optional alerter.

        Args:
            alerter: object exposing ``alert_governance(event_type, payload)``.
                When ``None`` the failover alerter is resolved lazily inside
                ``_alert``.
        """
        self._alerter = alerter

    # =========================================================================
    # 1. Playbook trust drift
    # =========================================================================
    async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
        """Flag playbooks whose trust_score is below the threshold and retire stale ones.

        A drifted playbook (trust_score < TRUST_DRIFT_THRESHOLD) is switched to
        status ``deprecated`` when its reference time — ``last_used_at``, or
        ``created_at`` if it was never used — is older than
        TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS. The remaining drifted playbooks
        are only reported.

        2026-04-26 P2.2 by Claude
        2026-05-02 ogt + Claude Sonnet 4.6: added the auto-deprecate path.
        2026-05-05 Codex: ``emit_alert=False`` lets the W-6 watchdog read the
        statistics without sending a second Telegram message on top of the
        hourly self-check.

        Args:
            emit_alert: when False no governance alert is emitted.

        Returns:
            Dict with ``checked``, ``drifted``, ``drift_ratio`` (unrounded),
            ``auto_deprecated`` and ``kept``.
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(PlaybookRecord).where(
                    PlaybookRecord.status.not_in(["deprecated", "archived"])
                )
            )
            all_records = result.scalars().all()
            total = len(all_records)
            drifted = [r for r in all_records if float(r.trust_score) < TRUST_DRIFT_THRESHOLD]

            # Auto-deprecate eligibility.
            cutoff = now_taipei() - timedelta(days=TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS)
            auto_deprecated_ids: list[str] = []
            kept_ids: list[str] = []
            for r in drifted:
                # Never used → created_at stands in for "time it entered the system".
                ref_time = r.last_used_at if r.last_used_at is not None else r.created_at
                # NOTE(review): assumes the stored timestamps are comparable with
                # now_taipei() (same tz-awareness) — confirm against the model.
                if ref_time is not None and ref_time < cutoff:
                    r.status = "deprecated"
                    auto_deprecated_ids.append(r.playbook_id)
                else:
                    kept_ids.append(r.playbook_id)

            # 2026-05-02 Bug 1 fix (P0 silent failure): the commit has to happen
            # while the session is still open; committing after the context
            # manager had closed it raised an error that the caller swallowed.
            if auto_deprecated_ids:
                await db.commit()
                logger.info(
                    "governance_trust_drift_auto_deprecated",
                    count=len(auto_deprecated_ids),
                    ids=auto_deprecated_ids[:10],
                )

        # Computed once and shared by the alert payload and the return value.
        drift_ratio = len(drifted) / total if total > 0 else 0.0

        if drifted and emit_alert:
            await self._alert(
                "trust_drift",
                {
                    "status": "warning",
                    "impact": {
                        "drifted_count": len(drifted),
                        "total_playbooks": total,
                        "drift_ratio": round(drift_ratio, 3),
                        "threshold": TRUST_DRIFT_THRESHOLD,
                        "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "Auto-deprecate low-trust stale playbooks",
                            "Review candidate playbooks by impact scope and rollback if needed",
                        ],
                        "auto_deprecated_count": len(auto_deprecated_ids),
                        "auto_deprecated_ids": auto_deprecated_ids[:10],
                    },
                    "actionable": {
                        "items": [
                            "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
                            "必要時人工覆核 kept_ids 中的高風險 Playbook",
                        ],
                        "sample_playbook_ids": kept_ids[:10],
                    },
                    "drifted_count": len(drifted),
                    "auto_deprecated_count": len(auto_deprecated_ids),
                    "auto_deprecated_ids": auto_deprecated_ids[:10],
                    "playbook_ids": kept_ids[:10],
                    "total_playbooks": total,
                    "threshold": TRUST_DRIFT_THRESHOLD,
                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                },
            )

        logger.info(
            "governance_trust_drift_checked",
            total=total,
            drifted=len(drifted),
            auto_deprecated=len(auto_deprecated_ids),
            kept=len(kept_ids),
        )
        return {
            "checked": total,
            "drifted": len(drifted),
            "drift_ratio": drift_ratio,
            "auto_deprecated": len(auto_deprecated_ids),
            "kept": len(kept_ids),
        }

    # =========================================================================
    # 2. Knowledge-base degradation
    # =========================================================================
    async def check_knowledge_degradation(self) -> dict[str, Any]:
        """Alert when more than KM_STALE_RATIO of non-archived KM entries are stale.

        An entry is stale when ``updated_at`` is older than KM_STALE_DAYS days.
        When the ratio is above the threshold and an open owner-review dispatch
        already exists, the alert is suppressed. When the ratio is back within
        the threshold, open knowledge_degradation events are resolved.

        2026-04-26 P2.2 by Claude

        Returns:
            Dict with ``total``, ``stale`` and ``ratio`` (rounded to 3 places),
            plus either ``alert_suppressed``/``suppress_reason`` or
            ``resolved_open_events`` depending on the branch taken.
        """
        stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS)
        async with get_db_context() as db:
            # Count of all non-archived entries.
            total_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED
                )
            )
            total = total_result.scalar() or 0
            # Non-archived entries whose updated_at is older than the cutoff.
            stale_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                    KnowledgeEntryRecord.updated_at < stale_cutoff,
                )
            )
            stale = stale_result.scalar() or 0

        ratio = stale / total if total > 0 else 0.0

        if total > 0 and ratio > KM_STALE_RATIO:
            if await _has_open_knowledge_degradation_review():
                logger.info(
                    "governance_knowledge_degradation_alert_suppressed",
                    reason="open_owner_review_exists",
                    total=total,
                    stale=stale,
                    ratio=round(ratio, 3),
                )
                return {
                    "total": total,
                    "stale": stale,
                    "ratio": round(ratio, 3),
                    "alert_suppressed": True,
                    "suppress_reason": "open_owner_review_exists",
                }
            await self._alert(
                "knowledge_degradation",
                {
                    "status": "warning",
                    "impact": {
                        "stale_count": stale,
                        "total_count": total,
                        "stale_ratio": round(ratio, 3),
                        "threshold": KM_STALE_RATIO,
                        "stale_days": KM_STALE_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "啟動 KM 反查與自動補齊流程",
                            "關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
                        ],
                        "next_action": "run_kb_growth_healthcheck",
                    },
                    "ownership": {
                        "lead_agent": "Hermes",
                        "lead_reason": "E7 自動 KM 主責:反查 Incident / Sentry / SigNoz / PlayBook產生 KM 更新草稿與任務。",
                        "support_agents": [
                            "OpenClaw提供告警分類、規則匹配與 PlayBook 脈絡摘要,不直接批量改寫 KM。",
                            "ElephantAlpharead-only 稽核高影響 KM 草稿與風險,不執行寫入或通知。",
                        ],
                        "human_owner": "KM owner / SRE owner",
                        "human_reason": "審核高影響 KM 後才允許寫入,避免 AI 自動固化錯誤知識。",
                    },
                    "actionable": {
                        "items": [
                            "每日檢查 ANTI_PATTERN 更新結果",
                            "安排至少 2 位 owner 對 stale 條目做快速人工審核",
                        ],
                    },
                    "stale_count": stale,
                    "total_count": total,
                    "stale_ratio": round(ratio, 3),
                    "threshold": KM_STALE_RATIO,
                    "stale_days": KM_STALE_DAYS,
                },
            )

        logger.info(
            "governance_knowledge_degradation_checked",
            total=total,
            stale=stale,
            ratio=round(ratio, 3),
        )
        result = {"total": total, "stale": stale, "ratio": round(ratio, 3)}
        if total > 0 and ratio <= KM_STALE_RATIO:
            result["resolved_open_events"] = await _resolve_open_knowledge_degradation_events()
        return result

    # =========================================================================
    # 3. LLM hallucination rate
    # =========================================================================
    async def check_llm_hallucination(self) -> dict[str, Any]:
        """Alert when the share of ``failed`` verifications exceeds the threshold.

        Looks at the latest RECENT_LIMIT IncidentEvidence rows that have a
        non-null ``verification_result`` (ordered by ``collected_at``). Possible
        values are success / degraded / failed / timeout; only ``failed`` is
        counted as a hallucination.

        2026-04-26 P2.2 by Claude

        Returns:
            Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places).
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(IncidentEvidence.verification_result)
                .where(IncidentEvidence.verification_result.is_not(None))
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_hallucination_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for r in rows if r == "failed")
        rate = failed / total

        if rate > HALLUCINATION_RATE_THRESHOLD:
            await self._alert(
                "llm_hallucination",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_checked": total,
                        "hallucination_rate": round(rate, 3),
                        "threshold": HALLUCINATION_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "檢核 AI 建議來源與 evidence snapshot 一致性",
                            "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
                        ],
                        "next_action": "run_knowledge_gap_audit",
                        "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
                    },
                    "actionable": {
                        "items": [
                            "啟動 `playbook_evidence` 對齊補償流程",
                            "調整 verify timeout 與降級策略,避免過度信任低品質證據",
                        ],
                    },
                    "failed_count": failed,
                    "total_checked": total,
                    "hallucination_rate": round(rate, 3),
                    "threshold": HALLUCINATION_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_hallucination_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # 4. Execution failure rate (blast radius)
    # =========================================================================
    async def check_execution_blast_radius(self) -> dict[str, Any]:
        """Alert when the failure share of recent auto-repair executions is too high.

        Looks at the ``success`` flag of the latest RECENT_LIMIT
        AutoRepairExecution rows (ordered by ``created_at``). Any falsy flag is
        counted as a failure, so a NULL ``success`` counts too.

        2026-04-26 P2.2 by Claude

        Returns:
            Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places).
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution.success)
                .order_by(AutoRepairExecution.created_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_blast_radius_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for r in rows if not r)
        rate = failed / total

        if rate > EXECUTION_FAIL_RATE_THRESHOLD:
            await self._alert(
                "execution_blast_radius",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_executions": total,
                        "failure_rate": round(rate, 3),
                        "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "鎖定失敗 playbook 清單,關閉高風險自動執行",
                            "比對 incident evidence 與 post_execution_verification 失敗原因",
                        ],
                        "next_action": "pause_auto_repair_for_top_failing_playbooks",
                        "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
                    },
                    "actionable": {
                        "items": [
                            "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
                            "必要時啟用 emergency fallback 路由進人工審核",
                        ],
                    },
                    "failed_count": failed,
                    "total_executions": total,
                    "failure_rate": round(rate, 3),
                    "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_blast_radius_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # 5. SLO compliance (ADR-100)
    # =========================================================================
    @staticmethod
    def _evaluate_slo_sample(
        value: float,
        threshold: float,
        target: float,
        direction: str,
    ) -> tuple[bool, float]:
        """Compare one SLI sample with its hard red line.

        Args:
            value: the sampled SLI value.
            threshold: the hard red line.
            target: the SLO target.
            direction: ``"below"`` means the value must stay below the red
                line; any other value means it must stay above it.

        Returns:
            ``(violated, gap)``. When violated, ``gap`` is how far the value is
            past the red line; otherwise it is the signed margin to the target
            (positive when the target is met).
        """
        if direction == "below":
            violated = value > threshold
            gap = value - threshold if violated else target - value
        else:
            violated = value < threshold
            gap = threshold - value if violated else value - target
        return violated, gap

    @staticmethod
    def _summarize_slo_results(results: dict[str, Any]) -> dict[str, Any]:
        """Aggregate per-SLO entries into the ``_meta`` summary.

        Overall ``status`` is, in priority order:
        ``no_data`` (something skipped, nothing ok, nothing violated),
        ``violated`` (at least one violation),
        ``error`` (nothing ok and at least one query errored — without this
        case a total Prometheus outage was reported as ``ok``),
        otherwise ``ok``.

        Args:
            results: mapping of SLO name to its result dict; non-dict values
                and an existing ``_meta`` key are ignored.
        """
        statuses = [
            entry.get("status")
            for key, entry in results.items()
            if key != "_meta" and isinstance(entry, dict)
        ]
        violated_count = statuses.count("violated")
        skipped_count = statuses.count("skipped")
        ok_count = statuses.count("ok")
        error_count = statuses.count("error")
        all_skipped = skipped_count > 0 and ok_count == 0 and violated_count == 0

        if all_skipped:
            status = "no_data"
        elif violated_count > 0:
            status = "violated"
        elif ok_count == 0 and error_count > 0:
            status = "error"
        else:
            status = "ok"

        return {
            "violated_count": violated_count,
            "skipped_count": skipped_count,
            "ok_count": ok_count,
            "error_count": error_count,
            "all_status": sorted(set(statuses)),
            "all_skipped": all_skipped,
            "status": status,
        }

    async def check_slo_compliance(self) -> dict[str, Any]:
        """Check five SLOs against their hard red lines via Prometheus.

        Each SLI is read with an instant query and compared with its hard red
        line; a violation goes through ``_alert``.

        SLO 1 autonomy rate:          sli:autonomy_rate:5m           red line < 0.70
        SLO 2 decision accuracy:      sli:decision_accuracy:5m       red line < 0.85
        SLO 3 confidence calibration: sli:confidence_calibration:1h  red line < 0.70
        SLO 4 KM growth rate:         knowledge_entries_created_24h / sli:km_growth_rate:24h  red line < 5
        SLO 5 summary latency:        awooop_truth_chain_quality_summary_last_duration_seconds  red line > 8s

        2026-04-27 P3.4 by Claude — AI SLO (ADR-100)

        Returns:
            Mapping of SLO name to a result dict whose ``status`` is one of
            ``ok`` / ``violated`` / ``skipped`` / ``error``, plus a ``_meta``
            entry with the aggregate counts and overall status.
        """
        import math

        import httpx

        from src.core.config import settings

        prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")
        queries: dict[str, str] = {
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
            "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
            "truth_chain_quality_summary_latency": 'max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
        }
        # Hard red lines: an "above" metric below this value, or a "below"
        # metric above it, must alert (this is not a soft warning).
        hard_red_lines: dict[str, float] = {
            "autonomy_rate": 0.70,
            "decision_accuracy": 0.85,
            "confidence_calibration": 0.70,
            "km_growth_rate": 5.0,
            "truth_chain_quality_summary_latency": 8.0,
        }
        # SLO targets (reported in the result and in the log).
        slo_targets: dict[str, float] = {
            "autonomy_rate": 0.80,
            "decision_accuracy": 0.90,
            "confidence_calibration": 0.80,
            "km_growth_rate": 20.0,
            "truth_chain_quality_summary_latency": 2.0,
        }
        slo_directions: dict[str, str] = {
            "autonomy_rate": "above",
            "decision_accuracy": "above",
            "confidence_calibration": "above",
            "km_growth_rate": "above",
            "truth_chain_quality_summary_latency": "below",
        }

        results: dict[str, Any] = {}
        async with httpx.AsyncClient(timeout=5.0) as client:
            for name, query in queries.items():
                try:
                    resp = await client.get(
                        f"{prom_url}/api/v1/query",
                        params={"query": query},
                    )
                    data = resp.json()

                    if data.get("status") != "success":
                        results[name] = {
                            "name": name,
                            "status": "error",
                            "error": "prometheus_query_failed",
                            "response_status": data.get("status"),
                        }
                        logger.warning(
                            "governance_slo_prometheus_error",
                            slo=name,
                            query=query,
                            response_status=data.get("status"),
                        )
                        continue

                    result_list = data.get("data", {}).get("result", [])
                    # 2026-04-28 ogt + Claude Opus 4.7: P0-1 false-alarm fix.
                    # An empty result means Prometheus has no data (metric not
                    # emitted / rule not deployed); it does not mean SLO = 0.
                    # Falling back to 0.0 would always register as a violation.
                    if not result_list:
                        results[name] = {
                            "name": name,
                            "status": "skipped",
                            "error": "no_data",
                            "reason": "prometheus_empty_result_metric_not_emitted",
                            "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
                        }
                        logger.warning(
                            "governance_slo_no_data",
                            slo=name,
                            query=query,
                            hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                        )
                        continue

                    raw_value = result_list[0]["value"][1]
                    value = float(raw_value)
                    # NaN / Inf cannot be compared with a red line; skip the sample.
                    if not math.isfinite(value):
                        results[name] = {
                            "name": name,
                            "status": "skipped",
                            "error": "non_finite_value",
                            "reason": "prometheus_nan_or_inf",
                            "hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
                        }
                        logger.warning(
                            "governance_slo_non_finite",
                            slo=name,
                            query=query,
                            value=str(raw_value),
                        )
                        continue

                    threshold = hard_red_lines[name]
                    target = slo_targets[name]
                    direction = slo_directions.get(name, "above")
                    violated, gap = self._evaluate_slo_sample(value, threshold, target, direction)

                    results[name] = {
                        "name": name,
                        "status": "violated" if violated else "ok",
                        "value": round(value, 4),
                        "slo_target": target,
                        "hard_red_line": threshold,
                        "direction": direction,
                        "gap": round(gap, 4),
                        "violated": violated,
                    }

                    if violated:
                        await self._alert(
                            f"slo_{name}_violation",
                            {
                                "status": "violation",
                                "impact": {
                                    "name": name,
                                    "value": round(value, 4),
                                    "target": target,
                                    "threshold": threshold,
                                    "direction": direction,
                                    "gap": round(gap, 4),
                                },
                                "remediation": {
                                    "items": _slo_remediation_items(name),
                                    "next_action": _slo_next_action(name),
                                },
                                "actionable": {
                                    "items": _slo_actionable_items(name),
                                },
                            },
                        )
                        logger.warning(
                            "governance_slo_violated",
                            slo=name,
                            value=round(value, 4),
                            hard_red_line=threshold,
                        )
                    elif value == 0 and threshold <= 0:
                        logger.warning(
                            "governance_slo_unexpected_zero",
                            slo=name,
                            value=round(value, 4),
                        )
                    else:
                        logger.info(
                            "governance_slo_ok",
                            slo=name,
                            value=round(value, 4),
                            target=target,
                        )
                except Exception as e:
                    # One failing SLO query must not stop the remaining ones.
                    results[name] = {
                        "name": name,
                        "status": "error",
                        "error": str(e),
                    }
                    logger.warning("governance_slo_check_error", slo=name, error=str(e))

        # 2026-04-29 ogt + Claude Opus 4.7 (critic M6): the aggregate _meta lets
        # a dashboard tell "everything skipped" (metric not emitted) from
        # "everything ok" (SLO healthy) so no_data is never shown as a pass.
        meta = self._summarize_slo_results(results)
        results["_meta"] = meta
        logger.info(
            "governance_slo_compliance_complete",
            results=results,
            violated=meta["violated_count"],
            skipped=meta["skipped_count"],
            ok=meta["ok_count"],
            status=meta["status"],
        )
        return results

    # =========================================================================
    # Run everything (exceptions isolated per check)
    # =========================================================================
    async def run_self_check(self) -> dict[str, Any]:
        """Run all five checks, each isolated in its own try/except.

        A check that raises is recorded as ``{"error": str(exc)}``. When three
        or more checks fail that way a ``governance_self_failure`` alert is
        emitted; when every SLO was skipped a separate
        ``governance_slo_data_gap`` alert is emitted.

        2026-04-26 P2.2 by Claude
        2026-04-27 P3.4 by Claude — added the fifth check, slo_compliance (ADR-100)

        Returns:
            Mapping of check name to that check's result dict.
        """
        results: dict[str, Any] = {}
        checks = [
            ("trust_drift", self.check_trust_drift),
            ("knowledge_degradation", self.check_knowledge_degradation),
            ("llm_hallucination", self.check_llm_hallucination),
            ("execution_blast_radius", self.check_execution_blast_radius),
            ("slo_compliance", self.check_slo_compliance),
        ]
        for check_name, check_func in checks:
            try:
                results[check_name] = await check_func()
            except Exception as e:
                logger.exception(
                    "governance_check_failed",
                    check=check_name,
                    error=str(e),
                )
                results[check_name] = {"error": str(e)}

        # 2026-04-27 Wave8-X3 by Claude — B8 aggregated failure alert.
        # Three or more failed checks mean the governance mechanism itself is
        # broken, so an urgent alert has to go out.
        failed_checks = [k for k, v in results.items() if isinstance(v, dict) and "error" in v]
        if len(failed_checks) >= 3:
            try:
                await self._alert(
                    "governance_self_failure",
                    {
                        "status": "critical",
                        "impact": {
                            "failed_checks": failed_checks,
                            # Derived from the list above so it cannot drift
                            # when a check is added or removed.
                            "total_checks": len(checks),
                            "errors": {k: results[k].get("error") for k in failed_checks},
                        },
                        "remediation": {
                            "items": [
                                "暫停非關鍵治理自動化接收鏈路",
                                "聚焦治理執行路徑錯誤並補齊 fallback",
                            ],
                            "next_action": "investigate_governance_pipeline_health",
                        },
                        "actionable": {
                            "items": [
                                "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
                                "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_self_failure_alert_failed")

        # 2026-04-29 ogt + Claude Opus 4.7 (critic M6): "every SLO skipped"
        # means the data was never produced, not that governance is broken, so
        # it gets its own alert and does not pollute the self_failure count.
        slo_meta = (
            results.get("slo_compliance", {}).get("_meta")
            if isinstance(results.get("slo_compliance"), dict)
            else None
        )
        if slo_meta and slo_meta.get("all_skipped"):
            try:
                await self._alert(
                    "governance_slo_data_gap",
                    {
                        "status": "warning",
                        "impact": {
                            "reason": "all_slo_metrics_not_emitted",
                            "skipped_count": slo_meta.get("skipped_count", 0),
                            "all_slo_metrics_not_emitted": True,
                        },
                        "remediation": {
                            "items": [
                                "補齊 ADR-100 SLO emitterautomation_operation_log_total / post_execution_verification_total / knowledge_entries_total",
                                "確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
                            ],
                            "next_action": "run_adr100_slo_emit_playbook",
                            "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
                        },
                        "actionable": {
                            "items": [
                                "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                                "檢查 Prometheus rule 與 truth-chain quality summary runtime metric 是否可查詢",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_slo_data_gap_alert_failed")

        logger.info("governance_self_check_complete", results=results)
        return results

    # =========================================================================
    # Alert output
    # =========================================================================
    async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
        """Persist a governance event, log it, and push it through the alerter.

        Steps, each best-effort so one failure does not block the next:
        1. insert an AiGovernanceEvent row (ADR-085: learning results must be
           persisted in PG, not only cached);
        2. if the row was written, hand the event to the intake dispatch;
        3. emit a structlog warning;
        4. call ``alert_governance`` on the injected or lazily resolved alerter.

        2026-04-26 P2.2 by Claude
        2026-04-26 P2-DB-Fix by Claude — added the PG write to ai_governance_events

        Args:
            event_type: governance event type, stored and forwarded verbatim.
            payload: event details, stored as ``details`` and forwarded verbatim.
        """
        # 1. Write to PG; a failure is logged and does not abort the alert.
        event_id = generate_uuid()
        pg_written = False
        try:
            from sqlalchemy import insert as _sa_insert

            async with get_db_context() as db:
                await db.execute(
                    _sa_insert(AiGovernanceEvent).values(
                        id=event_id,
                        event_type=event_type,
                        details=payload,
                    )
                )
                await db.commit()
            pg_written = True
        except Exception as _pg_err:
            logger.warning("governance_pg_write_failed", error=str(_pg_err))

        if pg_written:
            await _maybe_create_intake_dispatch(event_id, event_type, payload)

        # 2. structlog. Payload keys that collide with the logger's own
        # arguments are dropped from the log fields only; passing them through
        # would raise TypeError and skip the push below.
        log_fields = {k: v for k, v in payload.items() if k not in ("event", "event_type")}
        logger.warning("governance_alert", event_type=event_type, **log_fields)

        # 3. Resolve the alerter lazily to avoid a circular import at startup.
        alerter = self._alerter
        if alerter is None:
            try:
                from src.services.failover_alerter import get_failover_alerter

                alerter = get_failover_alerter()
            except Exception as e:
                logger.warning("governance_alerter_get_failed", error=str(e))
                return

        try:
            await alerter.alert_governance(event_type, payload)
        except Exception as e:
            logger.warning("governance_telegram_alert_failed", error=str(e))
async def _has_open_knowledge_degradation_review() -> bool:
    """Tell whether a Hermes owner-review dispatch is already open for KM staleness.

    Several API Pods run the governance loop at the same time. Once a stale
    ratio has reached a Hermes review draft it counts as the same unfinished
    piece of governance work, so callers use this to avoid creating a new
    governance event and REVIEW draft on every cycle.

    Returns:
        True when an unresolved ``knowledge_degradation`` event has a
        ``hermes_kb_growth_healthcheck`` dispatch in status pending,
        dispatched, executing or succeeded. False otherwise, and also False
        (fail open) when the lookup itself raises.
    """
    open_statuses = ["pending", "dispatched", "executing", "succeeded"]
    try:
        async with get_db_context() as db:
            lookup = (
                select(GovernanceRemediationDispatch.id)
                .join(
                    AiGovernanceEvent,
                    GovernanceRemediationDispatch.governance_event_id == AiGovernanceEvent.id,
                )
                .where(
                    AiGovernanceEvent.event_type == "knowledge_degradation",
                    AiGovernanceEvent.resolved.is_(False),
                    GovernanceRemediationDispatch.event_type == "knowledge_degradation",
                    GovernanceRemediationDispatch.executor_type == "hermes_kb_growth_healthcheck",
                    GovernanceRemediationDispatch.dispatch_status.in_(open_statuses),
                )
                .order_by(GovernanceRemediationDispatch.dispatched_at.desc())
                .limit(1)
            )
            found = await db.execute(lookup)
            dispatch_id = found.scalar_one_or_none()
            return dispatch_id is not None
    except Exception as exc:
        # Fail open: a broken lookup must not silence the alert.
        logger.warning(
            "governance_knowledge_degradation_review_lookup_failed_fail_open",
            error=str(exc),
        )
        return False
async def _resolve_open_knowledge_degradation_events() -> int:
    """Mark every unresolved ``knowledge_degradation`` event as resolved.

    Called once the KM stale ratio is back within the threshold.

    Returns:
        The number of rows updated, or 0 when nothing matched or the update
        raised (the error is logged, not propagated).
    """
    try:
        async with get_db_context() as db:
            close_open_events = (
                update(AiGovernanceEvent)
                .where(
                    AiGovernanceEvent.event_type == "knowledge_degradation",
                    AiGovernanceEvent.resolved.is_(False),
                )
                .values(resolved=True, resolved_at=now_taipei())
                .execution_options(synchronize_session=False)
            )
            outcome = await db.execute(close_open_events)
            resolved_count = int(outcome.rowcount or 0)
            # Commit and log only when at least one row actually changed.
            if resolved_count:
                await db.commit()
                logger.info(
                    "governance_knowledge_degradation_resolved",
                    resolved_count=resolved_count,
                    reason="stale_ratio_recovered",
                )
            return resolved_count
    except Exception as exc:
        logger.warning(
            "governance_knowledge_degradation_resolve_failed",
            error=str(exc),
        )
        return 0
async def _maybe_create_intake_dispatch(
    event_id: str,
    event_type: str,
    payload: dict[str, Any],
) -> None:
    """Turn an actionable governance alert into a non-executing dispatch work item.

    Only ``knowledge_degradation`` events are handled; every other event type
    is ignored. This layer only creates a trackable dispatch: it runs no
    remediation, writes no KM entry and sends no extra notification. An
    already-active dispatch and any other failure are logged and swallowed.

    Args:
        event_id: id of the persisted governance event.
        event_type: governance event type.
        payload: the alert payload used to build the dispatch context.
    """
    handled = event_type == "knowledge_degradation"
    if not handled:
        return

    try:
        context = _build_knowledge_degradation_dispatch_context(event_id, payload)
        await create_dispatch(
            event_id=event_id,
            event_type=event_type,
            executor_type="hermes_kb_growth_healthcheck",
            decision_context=context,
            max_attempts=1,
            created_by="governance_agent_intake",
        )
    except DispatchAlreadyActive:
        logger.info(
            "governance_intake_dispatch_already_active",
            event_id=event_id,
            event_type=event_type,
        )
    except Exception as exc:
        logger.warning(
            "governance_intake_dispatch_failed",
            event_id=event_id,
            event_type=event_type,
            error=str(exc),
        )
def _build_knowledge_degradation_dispatch_context(
event_id: str,
payload: dict[str, Any],
) -> dict[str, Any]:
impact = payload.get("impact") if isinstance(payload.get("impact"), dict) else {}
remediation = payload.get("remediation") if isinstance(payload.get("remediation"), dict) else {}
ownership = payload.get("ownership") if isinstance(payload.get("ownership"), dict) else {}
next_action = remediation.get("next_action")
if not isinstance(next_action, str) or not next_action:
next_action = "run_kb_growth_healthcheck"
return {
"version": "v1",
"trigger_source": "governance_agent",
"triggered_metric": "knowledge_degradation",
"metric_value": impact.get("stale_ratio"),
"threshold": impact.get("threshold"),
"suggested_action": next_action,
"next_action": next_action,
"decision_path": "pending_owner_review",
"ownership": ownership,
"affected_resources": ["knowledge_entries"],
"workflow": {
"work_item_id": f"governance:knowledge_degradation:{event_id}",
"work_kind": "kb_growth_healthcheck",
"current_stage": "queued_kb_healthcheck",
"steps": [
"detected",
"ai_analyzed",
"queued_kb_healthcheck",
"draft_km_updates",
"waiting_owner_review",
"km_writeback_after_approval",
"stale_ratio_recheck",
],
"stage_by_dispatch_status": {
"pending": "queued_kb_healthcheck",
"dispatched": "queued_kb_healthcheck",
"executing": "draft_km_updates",
"succeeded": "stale_ratio_recheck",
"failed": "needs_manual_km_triage",
"skipped": "waiting_owner_review",
"cancelled": "cancelled",
},
"next_action": next_action,
"needs_human_review": True,
"writes_km_without_approval": False,
"impact": impact,
},
"extra": {
"event_id": event_id,
"stale_count": impact.get("stale_count"),
"total_count": impact.get("total_count"),
"stale_days": impact.get("stale_days"),
"ownership": ownership,
},
}
# =============================================================================
# Singleton + 排程迴圈
# =============================================================================
# Process-wide GovernanceAgent instance, created on first use.
_agent: GovernanceAgent | None = None


def get_governance_agent() -> GovernanceAgent:
    """Return the shared GovernanceAgent, creating it on the first call.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    if _agent is not None:
        return _agent
    _agent = GovernanceAgent()
    return _agent
def reset_governance_agent() -> None:
    """Drop the shared GovernanceAgent so the next lookup builds a new one (for tests).

    2026-04-26 P2.2 by Claude
    """
    global _agent
    _agent = None
async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run ``GovernanceAgent.run_self_check()`` forever, once per interval.

    Follows the asyncio.create_task + sleep loop pattern used by main.py (no
    APScheduler). Each cycle sleeps ``interval_seconds`` after it finishes, so
    runs never pile up. A cycle is skipped when the self-check lease is not
    acquired. Any exception raised inside a cycle is logged and the loop
    carries on.

    2026-04-26 P2.2 by Claude

    Args:
        interval_seconds: pause between cycles; also passed to the lease helper.
    """
    agent = get_governance_agent()
    while True:
        try:
            lease_acquired = await _try_acquire_governance_self_check_lease(interval_seconds)
            if not lease_acquired:
                logger.debug(
                    "governance_self_check_cycle_skipped",
                    reason="cycle_lease_held",
                )
            else:
                await agent.run_self_check()
        except Exception as e:
            logger.warning("governance_loop_error", error=str(e))
        await asyncio.sleep(interval_seconds)
async def _try_acquire_governance_self_check_lease(interval_seconds: int) -> bool:
    """Try to take the cross-Pod lease for one self-check cycle.

    This is a per-cycle cooldown, not a critical-section lock: the key is set
    with NX and a TTL and is never released explicitly. Until the TTL expires
    other replicas skip their cycle, so one governance state is not written as
    several events and several Hermes KM drafts.

    Args:
        interval_seconds: loop interval; the TTL is this value, but at least 60s.

    Returns:
        True when the key was set, and also True (fail open) when Redis could
        not be reached; False when the key already exists.
    """
    ttl = max(60, int(interval_seconds))
    try:
        from src.core.redis_client import get_redis

        client = get_redis()
        was_set = await client.set(
            GOVERNANCE_SELF_CHECK_LEASE_KEY,
            "1",
            ex=ttl,
            nx=True,
        )
    except Exception as exc:
        # Fail open: without Redis every replica runs its own cycle.
        logger.warning(
            "governance_self_check_lease_unavailable_fail_open",
            error=str(exc),
        )
        return True
    else:
        return bool(was_set)