feat(governance): emit adr100 slo metrics

2026-05-14 18:57:03 +08:00
parent 1670ff1960
commit 13cf02b740
7 changed files with 334 additions and 9 deletions
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -76,13 +76,13 @@ from src.api.v1 import terminal as terminal_v1  # Phase 19.1: Omni-Terminal SSE
 from src.api.v1 import timeline as timeline_v1
 from src.api.v1 import webhooks as webhooks_v1
 from src.core.config import settings
-from src.core.feature_flags import aiops_flags  # ADR-080: AI 自主化飛輪 feature flags 啟動驗證
 from src.core.http_client import close_all_http_clients, init_all_http_clients
 from src.core.logging import get_logger, setup_logging
 from src.core.redis_client import close_redis_pool, init_redis_pool
-from src.services.flywheel_stats_service import get_flywheel_stats_service
 from src.core.sse import get_publisher
 from src.core.telemetry import setup_telemetry, shutdown_telemetry
+from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service
+from src.services.flywheel_stats_service import get_flywheel_stats_service

 # CTO-201: Database & Executor
 from src.db.base import close_db, init_db
@@ -554,7 +554,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
    # 2026-04-27 P3.1-T3 by Claude
    try:
        from src.utils.timezone import now_taipei
-        from datetime import datetime as _dt

        async def _run_kb_rot_cleaner_loop() -> None:
            from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner
@@ -1016,6 +1015,13 @@ async def prometheus_metrics() -> Response:
        content += flywheel_metrics.to_prometheus_lines()
    except Exception:
        logger.warning("prometheus_metrics_flywheel_error")
+    # 2026-05-14 Codex — T18 ADR-100 SLO emitter
+    # GovernanceAgent 讀 Prometheus recording rules；若 /metrics 不吐底層 DB totals，
+    # sli:* rules 會全空並每小時重複發 governance_slo_data_gap。
+    try:
+        content += await get_adr100_slo_metrics_service().to_prometheus_lines()
+    except Exception as exc:
+        logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc))
    return Response(content=content, media_type=CONTENT_TYPE_LATEST)


--- a/apps/api/src/services/adr100_slo_metrics_service.py
+++ b/apps/api/src/services/adr100_slo_metrics_service.py
@@ -0,0 +1,217 @@
+"""
+ADR-100 SLO metrics emitter.
+
+Prometheus recording rules for the AI flywheel SLOs expect a small set of
+counter-like metrics. The source of truth already lives in PostgreSQL, so this
+read-side emitter exposes DB totals on /metrics without changing runtime write
+paths or introducing another state store.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from time import time
+
+from sqlalchemy import text
+
+from src.db.base import get_db_context
+
+
+@dataclass(frozen=True)
+class AutomationOperationSample:
+    """One `automation_operation_log_total` sample: a row count for an (outcome, operation_type) pair."""
+
+    # Both strings are rendered as Prometheus label values; `count` is the sample value.
+    outcome: str
+    operation_type: str
+    count: int
+
+
+@dataclass(frozen=True)
+class VerificationSample:
+    """One `post_execution_verification_total` sample: a row count for a single outcome label."""
+
+    outcome: str
+    count: int
+
+
+@dataclass(frozen=True)
+class Adr100SloMetricsSnapshot:
+    """All values needed to render one ADR-100 SLO metrics payload.
+
+    Every field has a default, so an argument-less instance renders as an
+    "empty" payload. `frozen=True` only blocks re-binding attributes; the two
+    list fields themselves remain mutable.
+    """
+
+    # Labeled samples; an empty list makes the renderer emit a zero-valued placeholder series.
+    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
+    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
+    # Unlabeled totals rendered as single-sample series.
+    knowledge_entries_total: int = 0
+    high_confidence_total: int = 0
+    high_confidence_success_total: int = 0
+    # Defaults to time() at construction; rendered (rounded to whole seconds) as
+    # adr100_slo_emitter_last_success_timestamp.
+    emitted_at: float = field(default_factory=time)
+
+
+class Adr100SloMetricsService:
+    """Build ADR-100 Prometheus samples from production DB state.
+
+    Each call runs the aggregate queries again; nothing is cached between
+    calls.
+    """
+
+    @staticmethod
+    def _row_int(row, key: str) -> int:
+        """Return column *key* of a result row as an int, treating NULL as 0.
+
+        Columns are read through ``row._mapping`` instead of attribute access.
+        SQLAlchemy's ``Row`` is a ``Sequence`` and so carries ``count()`` and
+        ``index()`` methods; ``row.count`` therefore yields that bound method
+        rather than the ``count`` column, and ``int(row.count or 0)`` raises
+        ``TypeError`` as soon as a query returns at least one row.
+        """
+        return int(row._mapping[key] or 0)
+
+    async def to_prometheus_lines(self) -> str:
+        """Fetch a fresh snapshot and return it as Prometheus exposition text."""
+        snapshot = await self.fetch_snapshot()
+        return render_adr100_slo_metrics(snapshot)
+
+    async def fetch_snapshot(self) -> Adr100SloMetricsSnapshot:
+        """Run the four aggregate queries and pack the results into a snapshot.
+
+        Returns:
+            A snapshot whose ``emitted_at`` is the construction time.
+
+        Raises:
+            Whatever the DB layer raises; nothing is caught here, so the
+            caller decides how to degrade.
+        """
+        async with get_db_context() as db:
+            automation_rows = (
+                await db.execute(text(_AUTOMATION_OPERATION_SQL))
+            ).fetchall()
+            verification_rows = (
+                await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
+            ).fetchall()
+            knowledge_total = int(
+                (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
+                or 0
+            )
+            confidence_row = (
+                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
+            ).one()
+
+        # All rows are already materialized, so building samples needs no open session.
+        return Adr100SloMetricsSnapshot(
+            automation_operations=[
+                AutomationOperationSample(
+                    outcome=str(row._mapping["outcome"]),
+                    operation_type=str(row._mapping["operation_type"]),
+                    count=self._row_int(row, "count"),
+                )
+                for row in automation_rows
+            ],
+            post_execution_verifications=[
+                VerificationSample(
+                    outcome=str(row._mapping["outcome"]),
+                    count=self._row_int(row, "count"),
+                )
+                for row in verification_rows
+            ],
+            knowledge_entries_total=knowledge_total,
+            high_confidence_total=self._row_int(
+                confidence_row, "high_confidence_total"
+            ),
+            high_confidence_success_total=self._row_int(
+                confidence_row, "high_confidence_success_total"
+            ),
+        )
+
+
+def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
+    """Serialize *snapshot* into Prometheus text exposition format.
+
+    The payload starts and ends with an empty line so it can be appended to
+    other exposition text. When a labeled sample list is empty, a single
+    zero-valued series labeled "none" is written in its place.
+    """
+    automation_series = [
+        "automation_operation_log_total"
+        f'{{outcome="{_escape_label(op.outcome)}",'
+        f'operation_type="{_escape_label(op.operation_type)}"}} '
+        f"{op.count}"
+        for op in snapshot.automation_operations
+    ] or ['automation_operation_log_total{outcome="none",operation_type="none"} 0']
+
+    verification_series = [
+        "post_execution_verification_total"
+        f'{{outcome="{_escape_label(check.outcome)}"}} {check.count}'
+        for check in snapshot.post_execution_verifications
+    ] or ['post_execution_verification_total{outcome="none"} 0']
+
+    payload = [
+        "",
+        "# HELP automation_operation_log_total DB-derived AI automation operation count for ADR-100 SLOs",
+        "# TYPE automation_operation_log_total counter",
+        *automation_series,
+        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
+        "# TYPE post_execution_verification_total counter",
+        *verification_series,
+        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
+        "# TYPE knowledge_entries_total counter",
+        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
+        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
+        "# TYPE approval_records_high_confidence_total counter",
+        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
+        "# HELP approval_records_high_confidence_success_total DB-derived high confidence approval decisions with successful verification for ADR-100 SLOs",
+        "# TYPE approval_records_high_confidence_success_total counter",
+        f"approval_records_high_confidence_success_total {snapshot.high_confidence_success_total}",
+        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
+        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
+        # Whole seconds only: the timestamp is formatted with zero decimals.
+        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
+        "",
+    ]
+    return "\n".join(payload)
+
+
+def _escape_label(value: str) -> str:
+    """Escape backslash, newline and double quote for use inside a Prometheus label value."""
+    # One pass over the string; each special character maps to its escaped form.
+    return value.translate({0x5C: "\\\\", 0x0A: "\\n", 0x22: '\\"'})
+
+
+# Counts automation_operation_log rows per (outcome, operation_type).
+# `outcome` is derived by the CASE expression:
+#   - any status other than 'success' is reported as that status verbatim;
+#   - successful rows with actor 'approval_execution' whose
+#     input->>'requested_by' does not start with "auto" (case-insensitive,
+#     missing value treated as '') become 'human_required';
+#   - every remaining row becomes 'auto_executed'.
+# NOTE(review): for a NULL status, `status <> 'success'` evaluates to NULL, so
+# such rows skip the first branch and may be counted as 'auto_executed' —
+# confirm the column is NOT NULL.
+# NOTE(review): if `%%` reaches PostgreSQL un-collapsed it is two LIKE
+# wildcards, which match the same strings as one `%`; the prefix test holds
+# either way.
+# NOTE(review): the `count` alias shares its name with the `count()` method
+# that SQLAlchemy `Row` inherits from `Sequence`; read the column through
+# `row._mapping["count"]`, not `row.count`.
+_AUTOMATION_OPERATION_SQL = """
+    SELECT
+        CASE
+            WHEN status <> 'success' THEN status
+            WHEN actor = 'approval_execution'
+                 AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
+                THEN 'human_required'
+            ELSE 'auto_executed'
+        END AS outcome,
+        operation_type,
+        count(*) AS count
+    FROM automation_operation_log
+    GROUP BY outcome, operation_type
+    ORDER BY outcome, operation_type
+"""
+
+
+# Counts incident_evidence rows per verification_result, ignoring rows where
+# the result is NULL. The result value is exposed as `outcome`.
+# NOTE(review): the `count` alias shares its name with the `count()` method
+# that SQLAlchemy `Row` inherits from `Sequence`; read the column through
+# `row._mapping["count"]`, not `row.count`.
+_POST_EXECUTION_VERIFICATION_SQL = """
+    SELECT verification_result AS outcome, count(*) AS count
+    FROM incident_evidence
+    WHERE verification_result IS NOT NULL
+    GROUP BY verification_result
+    ORDER BY verification_result
+"""
+
+
+# Returns exactly one row with two counts over approval_records.
+# Per-record confidence is the first non-NULL of:
+#   1. extra_metadata->>'confidence_score', 2. extra_metadata->>'confidence',
+#   3. composite_score, 4. the constant 0.
+# The regex guard only lets plain unsigned decimals ("0.85", "1") through to
+# the ::numeric cast; anything else (".9", "1e-1", "-0.5", text) falls through
+# to the next source instead of raising a cast error.
+# high_confidence_total counts records with confidence >= 0.8;
+# high_confidence_success_total additionally requires at least one
+# incident_evidence row for the same incident with verification_result = 'success'.
+# NOTE(review): the 0.8 threshold assumes confidence is stored on a 0-1 scale —
+# confirm no producer writes percentages.
+_HIGH_CONFIDENCE_APPROVAL_SQL = """
+    WITH approval_confidence AS (
+        SELECT
+            id,
+            incident_id,
+            COALESCE(
+                CASE
+                    WHEN extra_metadata->>'confidence_score' ~ '^[0-9]+(\\.[0-9]+)?$'
+                        THEN (extra_metadata->>'confidence_score')::numeric
+                    ELSE NULL
+                END,
+                CASE
+                    WHEN extra_metadata->>'confidence' ~ '^[0-9]+(\\.[0-9]+)?$'
+                        THEN (extra_metadata->>'confidence')::numeric
+                    ELSE NULL
+                END,
+                composite_score,
+                0
+            ) AS confidence
+        FROM approval_records
+    )
+    SELECT
+        count(*) FILTER (WHERE confidence >= 0.8) AS high_confidence_total,
+        count(*) FILTER (
+            WHERE confidence >= 0.8
+              AND EXISTS (
+                  SELECT 1
+                  FROM incident_evidence ev
+                  WHERE ev.incident_id = approval_confidence.incident_id
+                    AND ev.verification_result = 'success'
+              )
+        ) AS high_confidence_success_total
+    FROM approval_confidence
+"""
+
+
+# Module-level slot holding the lazily created service instance.
+_adr100_slo_metrics_service: Adr100SloMetricsService | None = None
+
+
+def get_adr100_slo_metrics_service() -> Adr100SloMetricsService:
+    """Return the shared service instance, creating it on first use."""
+    global _adr100_slo_metrics_service
+    service = _adr100_slo_metrics_service
+    if service is None:
+        service = Adr100SloMetricsService()
+        _adr100_slo_metrics_service = service
+    return service
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -447,13 +447,13 @@ class GovernanceAgent:
                                "status": "skipped",
                                "error": "no_data",
                                "reason": "prometheus_empty_result_metric_not_emitted",
-                                "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                                "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入，或 multiprocess 目錄未掛載",
                            }
                            logger.warning(
                                "governance_slo_no_data",
                                slo=name,
                                query=query,
-                                hint="ADR-100 emitter not yet implemented",
+                                hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                            )
                            continue
                        value = float(result_list[0]["value"][1])
@@ -655,15 +655,15 @@ class GovernanceAgent:
                        },
                        "remediation": {
                            "items": [
-                                "補齊 ADR-100 SLO emitter（automation_operation_log_total / post_execution_verification_total / km_entries_total）",
-                                "設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄（如 emptyDir）",
+                                "補齊 ADR-100 SLO emitter（automation_operation_log_total / post_execution_verification_total / knowledge_entries_total）",
+                                "確認 Prometheus recording rules 已載入，且 API Pod multiprocess 目錄可寫",
                            ],
                            "next_action": "run_adr100_slo_emit_playbook",
-                            "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                            "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
                        },
                        "actionable": {
                            "items": [
-                                "先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
+                                "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                                "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
                            ],
                        },
--- a/apps/api/tests/test_adr100_slo_metrics_service.py
+++ b/apps/api/tests/test_adr100_slo_metrics_service.py
@@ -0,0 +1,71 @@
+from src.services.adr100_slo_metrics_service import (
+    Adr100SloMetricsSnapshot,
+    AutomationOperationSample,
+    VerificationSample,
+    render_adr100_slo_metrics,
+)
+
+
+def test_render_adr100_slo_metrics_outputs_required_series() -> None:
+    """A populated snapshot renders every series with its value."""
+    automation = [
+        AutomationOperationSample(
+            outcome=outcome,
+            operation_type="playbook_executed",
+            count=count,
+        )
+        for outcome, count in (("auto_executed", 8), ("human_required", 2))
+    ]
+    verifications = [
+        VerificationSample(outcome=outcome, count=count)
+        for outcome, count in (("success", 7), ("failed", 1))
+    ]
+
+    rendered = render_adr100_slo_metrics(
+        Adr100SloMetricsSnapshot(
+            automation_operations=automation,
+            post_execution_verifications=verifications,
+            knowledge_entries_total=2161,
+            high_confidence_total=9,
+            high_confidence_success_total=7,
+            emitted_at=1_778_756_000,
+        )
+    )
+
+    expected_fragments = (
+        'automation_operation_log_total{outcome="auto_executed",operation_type="playbook_executed"} 8',
+        'post_execution_verification_total{outcome="success"} 7',
+        "knowledge_entries_total 2161",
+        "approval_records_high_confidence_total 9",
+        "approval_records_high_confidence_success_total 7",
+        "adr100_slo_emitter_last_success_timestamp 1778756000",
+    )
+    for fragment in expected_fragments:
+        assert fragment in rendered
+
+
+def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
+    """A default snapshot still renders zero-valued placeholder series."""
+    empty_snapshot = Adr100SloMetricsSnapshot(emitted_at=1_778_756_000)
+    rendered = render_adr100_slo_metrics(empty_snapshot)
+
+    placeholders = (
+        'automation_operation_log_total{outcome="none",operation_type="none"} 0',
+        'post_execution_verification_total{outcome="none"} 0',
+        "knowledge_entries_total 0",
+    )
+    for placeholder in placeholders:
+        assert placeholder in rendered
+
+
+def test_render_adr100_slo_metrics_escapes_labels() -> None:
+    """Double quotes and newlines in label values are backslash-escaped."""
+    hostile_sample = AutomationOperationSample(
+        outcome='auto"executed',
+        operation_type="line\nbreak",
+        count=1,
+    )
+    snapshot = Adr100SloMetricsSnapshot(
+        automation_operations=[hostile_sample],
+        emitted_at=1_778_756_000,
+    )
+
+    rendered = render_adr100_slo_metrics(snapshot)
+
+    assert 'outcome="auto\\"executed"' in rendered
+    assert 'operation_type="line\\nbreak"' in rendered
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,25 @@
+## 2026-05-14 | T18 ADR-100 SLO emitter 接入，治理資料缺口告警轉為可驗證指標
+
+**背景**：Telegram 反覆出現「AI 治理警報｜SLO 資料缺口」，但訊息只能說 `all_slo_metrics_not_emitted`，無法讓 Operator 判斷是 Pod 掛載、Prometheus rule、還是 emitter 本身缺失。Production 查核確認 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir` 掛載，真正缺口是 `/metrics` 沒有輸出 ADR-100 recording rules 所需的底層 series。
+
+**修正**：
+- 新增 `adr100_slo_metrics_service.py`，從 PostgreSQL 事實來源產出 DB-derived Prometheus 指標：`automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`。
+- `/metrics` 追加 ADR-100 SLO emitter，不新增 DB schema、不改 Prometheus scrape target，讓既有 `awoooi-api` scrape job 可直接取得底層 series。
+- `GovernanceAgent` 的 SLO no-data hint 改成 emitter / recording rule / multiprocess mount 三段式，不再把已驗證存在的 `PROMETHEUS_MULTIPROC_DIR` 當成單一原因。
+- 清理 `main.py` 兩個既有未使用 import（`aiops_flags`、`_dt`），避免本次觸碰檔案繼續帶 F401 技術債。
+
+**本地驗證**：
+- `python3 -m py_compile apps/api/src/services/adr100_slo_metrics_service.py apps/api/src/services/governance_agent.py apps/api/src/main.py apps/api/tests/test_adr100_slo_metrics_service.py`：pass。
+- `pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`：47 passed。
+- `ruff check --select F,E9 src/services/adr100_slo_metrics_service.py src/services/governance_agent.py src/main.py tests/test_adr100_slo_metrics_service.py`：pass。
+- `git diff --check`：pass。
+- Production SQL dry-run：automation / verification / knowledge / high-confidence approval 查詢均可在現有 schema 上執行。
+
+**目前整體進度**：
+- Alertmanager 低風險自動修復主線：約 96%。
+- 完整 AI 自動化管理產品化：約 78%。
+- T18 正在推版；推版後需等 Prometheus scrape / recording rule evaluation，再確認 `sli:*` 不再全空，並觀察 `governance_slo_data_gap` 是否停止重複推播。
+
 ## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復，解除前端工作鏈路紅燈

 **背景**：T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞，但 API 層本身仍有兩個紅燈：`GET /api/v1/ai/governance/events?...` 回 500，`GET /api/v1/ai/governance/queue?dispatch_status=pending` 回 `table_pending=true`。統帥要求前端要能呈現完整流程，不能讓治理告警與 dispatch 階段停在 API 黑盒。
--- a/docs/adr/ADR-100-ai-autonomous-slo.md
+++ b/docs/adr/ADR-100-ai-autonomous-slo.md
@@ -158,6 +158,8 @@ increase(knowledge_entries_total[24h])
 | `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 |
 | `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard |
 | `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 |
+| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18：從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series |
+| `apps/api/src/main.py` `/metrics` | 2026-05-14 T18：追加 DB-derived SLO emitter，讓既有 `awoooi-api` scrape job 取得底層 series |

 ## 決策理由

--- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
+++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
@@ -2125,6 +2125,13 @@ Phase 6 完成後
 - Production deploy：`08d28dc4` 與 enum cast hotfix `6220f522` 已推 Gitea main；Code Review runs `2151` / `2153` success；CD runs `2150` / `2152` success；最新 deploy marker `9b32d3a9 chore(cd): deploy 6220f52 [skip ci]`；API / Worker / Web image 均為 `6220f5226693330a378f363202bd79065ab7fc34`；`governance/events` 200、`governance/queue` 200 且 `table_pending=false`；`/zh-TW/awooop/work-items` 200。
 - 目前進度更新：Alertmanager 低風險自動修復主線約 96%；完整 AI 自動化管理產品化約 76%。下一段收斂 governance dispatcher skipped reason / leader-dedupe / ADR-100 SLO emitter，並把治理 dispatch 階段完整呈現在 Operator Console。

+**T18 ADR-100 SLO emitter 接入（2026-05-14 台北）**：
+- 觸發：治理告警 `governance_slo_data_gap` 反覆推 Telegram，但 production 查核顯示 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir`，真正缺口是 `/metrics` 未輸出 ADR-100 recording rules 所需底層 series，導致 `sli:*` 全部 empty result。
+- 修正：新增 DB-derived `/metrics` emitter，從 `automation_operation_log`、`incident_evidence`、`knowledge_entries`、`approval_records` 暴露 `automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`；不新增 schema、不改 scrape target。
+- 訊息治理：`GovernanceAgent` no-data hint 改為 emitter / recording rule / multiprocess mount 三段式，避免 Operator 被誤導成只有 `PROMETHEUS_MULTIPROC_DIR` 未設。
+- 驗證：`py_compile` pass；`pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q` 47 passed；ruff F/E9 pass；diff check pass；production SQL dry-run 通過。
+- 目前進度更新：Alertmanager 低風險自動修復主線約 96%；完整 AI 自動化管理產品化約 78%。推版後需等 Prometheus scrape / recording rule evaluation，再確認 `sli:*` 不再全空。
+
 ---

 ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護（commit de2d34d）