feat(governance): emit adr100 slo metrics
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / build-and-deploy (push) Successful in 3m21s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s

This commit is contained in:
Your Name
2026-05-14 18:57:03 +08:00
parent 1670ff1960
commit 13cf02b740
7 changed files with 334 additions and 9 deletions

View File

@@ -76,13 +76,13 @@ from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
from src.api.v1 import timeline as timeline_v1
from src.api.v1 import webhooks as webhooks_v1
from src.core.config import settings
from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 feature flags 啟動驗證
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.services.flywheel_stats_service import get_flywheel_stats_service
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry
from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service
from src.services.flywheel_stats_service import get_flywheel_stats_service
# CTO-201: Database & Executor
from src.db.base import close_db, init_db
@@ -554,7 +554,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
# 2026-04-27 P3.1-T3 by Claude
try:
from src.utils.timezone import now_taipei
from datetime import datetime as _dt
async def _run_kb_rot_cleaner_loop() -> None:
from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner
@@ -1016,6 +1015,13 @@ async def prometheus_metrics() -> Response:
content += flywheel_metrics.to_prometheus_lines()
except Exception:
logger.warning("prometheus_metrics_flywheel_error")
# 2026-05-14 Codex — T18 ADR-100 SLO emitter
# GovernanceAgent 讀 Prometheus recording rules若 /metrics 不吐底層 DB totals
# sli:* rules 會全空並每小時重複發 governance_slo_data_gap。
try:
content += await get_adr100_slo_metrics_service().to_prometheus_lines()
except Exception as exc:
logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc))
return Response(content=content, media_type=CONTENT_TYPE_LATEST)

View File

@@ -0,0 +1,217 @@
"""
ADR-100 SLO metrics emitter.
Prometheus recording rules for the AI flywheel SLOs expect a small set of
counter-like metrics. The source of truth already lives in PostgreSQL, so this
read-side emitter exposes DB totals on /metrics without changing runtime write
paths or introducing another state store.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from time import time
from sqlalchemy import text
from src.db.base import get_db_context
@dataclass(frozen=True)
class AutomationOperationSample:
    """One ``automation_operation_log_total`` sample for a single label pair."""

    # Rendered as the value of the ``outcome`` label.
    outcome: str
    # Rendered as the value of the ``operation_type`` label.
    operation_type: str
    # Sample value written after the label set.
    count: int
@dataclass(frozen=True)
class VerificationSample:
    """One ``post_execution_verification_total`` sample for a single outcome."""

    # Rendered as the value of the ``outcome`` label.
    outcome: str
    # Sample value written after the label set.
    count: int
@dataclass(frozen=True)
class Adr100SloMetricsSnapshot:
    """Frozen bundle of the values consumed by ``render_adr100_slo_metrics``.

    Every field has a default, so an argument-less snapshot renders only
    zero-valued series.
    """

    # Samples for automation_operation_log_total; an empty list makes the
    # renderer emit a single zero-valued placeholder series instead.
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
    # Samples for post_execution_verification_total; same empty-list fallback.
    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
    # Value of the unlabelled knowledge_entries_total series.
    knowledge_entries_total: int = 0
    # Value of approval_records_high_confidence_total.
    high_confidence_total: int = 0
    # Value of approval_records_high_confidence_success_total.
    high_confidence_success_total: int = 0
    # Defaults to time.time() at construction; rendered with no decimals as
    # adr100_slo_emitter_last_success_timestamp.
    emitted_at: float = field(default_factory=time)
class Adr100SloMetricsService:
    """Build ADR-100 Prometheus samples from production DB state.

    The service keeps no state of its own: each call opens a database context,
    runs the aggregate queries defined in this module and converts the rows
    into an ``Adr100SloMetricsSnapshot``.
    """

    async def to_prometheus_lines(self) -> str:
        """Return a freshly fetched snapshot rendered as Prometheus text.

        Exceptions raised while querying the database are not handled here and
        propagate to the caller.
        """
        snapshot = await self.fetch_snapshot()
        return render_adr100_slo_metrics(snapshot)

    async def fetch_snapshot(self) -> Adr100SloMetricsSnapshot:
        """Run the aggregate queries and pack their results into a snapshot.

        Returns:
            A snapshot with one sample per row of the automation and
            verification queries, the ``knowledge_entries`` row count and the
            two high-confidence approval totals. NULL aggregates are coerced
            to ``0``; ``emitted_at`` keeps its default.
        """
        async with get_db_context() as db:
            automation_rows = (
                await db.execute(text(_AUTOMATION_OPERATION_SQL))
            ).fetchall()
            verification_rows = (
                await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
            ).fetchall()
            knowledge_total = int(
                (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
                or 0
            )
            confidence_row = (
                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
            ).one()

            # Rows are unpacked by position (the SELECT column order) instead of
            # being read as `row.count`: a SQLAlchemy Row is a Sequence, so the
            # attribute `count` resolves to the inherited `count()` method
            # rather than the column aliased `count`, and `int(row.count or 0)`
            # would raise TypeError for every returned row.
            return Adr100SloMetricsSnapshot(
                automation_operations=[
                    AutomationOperationSample(
                        outcome=str(outcome),
                        operation_type=str(operation_type),
                        count=int(count or 0),
                    )
                    for outcome, operation_type, count in automation_rows
                ],
                post_execution_verifications=[
                    VerificationSample(
                        outcome=str(outcome),
                        count=int(count or 0),
                    )
                    for outcome, count in verification_rows
                ],
                knowledge_entries_total=knowledge_total,
                high_confidence_total=int(confidence_row.high_confidence_total or 0),
                high_confidence_success_total=int(
                    confidence_row.high_confidence_success_total or 0
                ),
            )
def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
    """Serialize a snapshot as Prometheus text exposition lines.

    The result begins and ends with a newline. When a labelled family has no
    samples, a single zero-valued series with ``"none"`` labels is written in
    its place.

    Args:
        snapshot: Values to render.

    Returns:
        HELP/TYPE headers and samples for the five DB-derived families,
        followed by the ``adr100_slo_emitter_last_success_timestamp`` gauge.
    """
    automation_samples = [
        "automation_operation_log_total"
        f'{{outcome="{_escape_label(op.outcome)}",'
        f'operation_type="{_escape_label(op.operation_type)}"}} '
        f"{op.count}"
        for op in snapshot.automation_operations
    ]
    if not automation_samples:
        # Placeholder series so the family is present even with no rows.
        automation_samples = [
            'automation_operation_log_total{outcome="none",operation_type="none"} 0'
        ]

    verification_samples = [
        "post_execution_verification_total"
        f'{{outcome="{_escape_label(check.outcome)}"}} {check.count}'
        for check in snapshot.post_execution_verifications
    ]
    if not verification_samples:
        verification_samples = ['post_execution_verification_total{outcome="none"} 0']

    success_sample = (
        "approval_records_high_confidence_success_total "
        f"{snapshot.high_confidence_success_total}"
    )

    # The empty first and last entries become the leading and trailing newline.
    return "\n".join([
        "",
        "# HELP automation_operation_log_total DB-derived AI automation operation count for ADR-100 SLOs",
        "# TYPE automation_operation_log_total counter",
        *automation_samples,
        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
        "# TYPE post_execution_verification_total counter",
        *verification_samples,
        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
        "# TYPE knowledge_entries_total counter",
        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_total counter",
        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
        "# HELP approval_records_high_confidence_success_total DB-derived high confidence approval decisions with successful verification for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_success_total counter",
        success_sample,
        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
        "",
    ])
def _escape_label(value: str) -> str:
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
# Counts automation_operation_log rows per (outcome, operation_type).
# `outcome` is derived per row by the CASE expression:
#   - the row's own status when status <> 'success';
#   - 'human_required' when actor = 'approval_execution' and
#     input->>'requested_by' (NULL treated as '') does not match the
#     case-insensitive 'auto...' prefix pattern;
#   - 'auto_executed' otherwise.
# A NULL status does not satisfy the first comparison, so such rows are
# classified by the later branches.
# NOTE(review): the aggregate is aliased `count`, which is also the name of the
# Sequence `count()` method on SQLAlchemy Row objects — confirm consumers read
# it by position or through the row mapping rather than as `row.count`.
# NOTE(review): the doubled percent sign in the ILIKE pattern looks like driver
# escaping; consecutive `%` wildcards match the same strings as one — confirm
# against the DB driver in use.
_AUTOMATION_OPERATION_SQL = """
SELECT
CASE
WHEN status <> 'success' THEN status
WHEN actor = 'approval_execution'
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
THEN 'human_required'
ELSE 'auto_executed'
END AS outcome,
operation_type,
count(*) AS count
FROM automation_operation_log
GROUP BY outcome, operation_type
ORDER BY outcome, operation_type
"""
# Counts incident_evidence rows per verification_result (exposed as `outcome`),
# ignoring rows whose verification_result is NULL.
# NOTE(review): the aggregate is aliased `count`, which is also the name of the
# Sequence `count()` method on SQLAlchemy Row objects — confirm consumers read
# it by position or through the row mapping rather than as `row.count`.
_POST_EXECUTION_VERIFICATION_SQL = """
SELECT verification_result AS outcome, count(*) AS count
FROM incident_evidence
WHERE verification_result IS NOT NULL
GROUP BY verification_result
ORDER BY verification_result
"""
# Returns exactly one row with two totals over approval_records.
# Per-record confidence is the first non-NULL of:
#   1. extra_metadata->>'confidence_score', when it is a plain non-negative
#      decimal string (digits with an optional fractional part);
#   2. extra_metadata->>'confidence', under the same format check;
#   3. composite_score;
#   4. 0.
# high_confidence_total counts records with confidence >= 0.8.
# high_confidence_success_total counts the subset whose incident_id has at
# least one incident_evidence row with verification_result = 'success'; the
# match is per incident, not per approval record.
_HIGH_CONFIDENCE_APPROVAL_SQL = """
WITH approval_confidence AS (
SELECT
id,
incident_id,
COALESCE(
CASE
WHEN extra_metadata->>'confidence_score' ~ '^[0-9]+(\\.[0-9]+)?$'
THEN (extra_metadata->>'confidence_score')::numeric
ELSE NULL
END,
CASE
WHEN extra_metadata->>'confidence' ~ '^[0-9]+(\\.[0-9]+)?$'
THEN (extra_metadata->>'confidence')::numeric
ELSE NULL
END,
composite_score,
0
) AS confidence
FROM approval_records
)
SELECT
count(*) FILTER (WHERE confidence >= 0.8) AS high_confidence_total,
count(*) FILTER (
WHERE confidence >= 0.8
AND EXISTS (
SELECT 1
FROM incident_evidence ev
WHERE ev.incident_id = approval_confidence.incident_id
AND ev.verification_result = 'success'
)
) AS high_confidence_success_total
FROM approval_confidence
"""
# Module-level cache for the lazily created service instance.
_adr100_slo_metrics_service: Adr100SloMetricsService | None = None


def get_adr100_slo_metrics_service() -> Adr100SloMetricsService:
    """Return the shared service instance, creating it on first use."""
    global _adr100_slo_metrics_service
    service = _adr100_slo_metrics_service
    if service is None:
        service = Adr100SloMetricsService()
        _adr100_slo_metrics_service = service
    return service

View File

@@ -447,13 +447,13 @@ class GovernanceAgent:
"status": "skipped",
"error": "no_data",
"reason": "prometheus_empty_result_metric_not_emitted",
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
"hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
}
logger.warning(
"governance_slo_no_data",
slo=name,
query=query,
hint="ADR-100 emitter not yet implemented",
hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
)
continue
value = float(result_list[0]["value"][1])
@@ -655,15 +655,15 @@ class GovernanceAgent:
},
"remediation": {
"items": [
"補齊 ADR-100 SLO emitterautomation_operation_log_total / post_execution_verification_total / km_entries_total",
"設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir",
"補齊 ADR-100 SLO emitterautomation_operation_log_total / post_execution_verification_total / knowledge_entries_total",
"確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
],
"next_action": "run_adr100_slo_emit_playbook",
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
"hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
},
"actionable": {
"items": [
"先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
"先確認 /metrics 是否已輸出 ADR-100 底層指標",
"檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
],
},

View File

@@ -0,0 +1,71 @@
from src.services.adr100_slo_metrics_service import (
Adr100SloMetricsSnapshot,
AutomationOperationSample,
VerificationSample,
render_adr100_slo_metrics,
)
def test_render_adr100_slo_metrics_outputs_required_series() -> None:
    """A populated snapshot renders each expected sample line."""
    operations = [
        AutomationOperationSample(
            outcome="auto_executed",
            operation_type="playbook_executed",
            count=8,
        ),
        AutomationOperationSample(
            outcome="human_required",
            operation_type="playbook_executed",
            count=2,
        ),
    ]
    verifications = [
        VerificationSample(outcome="success", count=7),
        VerificationSample(outcome="failed", count=1),
    ]

    rendered = render_adr100_slo_metrics(
        Adr100SloMetricsSnapshot(
            automation_operations=operations,
            post_execution_verifications=verifications,
            knowledge_entries_total=2161,
            high_confidence_total=9,
            high_confidence_success_total=7,
            emitted_at=1_778_756_000,
        )
    )

    expected_fragments = (
        'automation_operation_log_total{outcome="auto_executed",'
        'operation_type="playbook_executed"} 8',
        'post_execution_verification_total{outcome="success"} 7',
        "knowledge_entries_total 2161",
        "approval_records_high_confidence_total 9",
        "approval_records_high_confidence_success_total 7",
        "adr100_slo_emitter_last_success_timestamp 1778756000",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
    """A snapshot without samples still renders zero-valued series."""
    empty_snapshot = Adr100SloMetricsSnapshot(emitted_at=1_778_756_000)
    rendered = render_adr100_slo_metrics(empty_snapshot)

    placeholders = (
        'automation_operation_log_total{outcome="none",operation_type="none"} 0',
        'post_execution_verification_total{outcome="none"} 0',
        "knowledge_entries_total 0",
    )
    for placeholder in placeholders:
        assert placeholder in rendered
def test_render_adr100_slo_metrics_escapes_labels() -> None:
    """Label values have double quote, newline and backslash escaped.

    These are the three characters the Prometheus text exposition format
    requires to be escaped inside a label value.
    """
    rendered = render_adr100_slo_metrics(
        Adr100SloMetricsSnapshot(
            automation_operations=[
                AutomationOperationSample(
                    outcome='auto"executed',
                    operation_type="line\nbreak",
                    count=1,
                ),
                # Backslash case: a raw backslash must come out doubled.
                AutomationOperationSample(
                    outcome="ok",
                    operation_type="back\\slash",
                    count=1,
                ),
            ],
            emitted_at=1_778_756_000,
        ),
    )
    assert 'outcome="auto\\"executed"' in rendered
    assert 'operation_type="line\\nbreak"' in rendered
    assert 'operation_type="back\\\\slash"' in rendered

View File

@@ -1,3 +1,25 @@
## 2026-05-14 | T18 ADR-100 SLO emitter 接入,治理資料缺口告警轉為可驗證指標
**背景**:Telegram 反覆出現「AI 治理警報:SLO 資料缺口」,但訊息只能說 `all_slo_metrics_not_emitted`,無法讓 Operator 判斷是 Pod 掛載、Prometheus rule、還是 emitter 本身缺失。Production 查核確認 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir` 掛載,真正缺口是 `/metrics` 沒有輸出 ADR-100 recording rules 所需的底層 series。
**修正**
- 新增 `adr100_slo_metrics_service.py`,從 PostgreSQL 事實來源產出 DB-derived Prometheus 指標:`automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`。
- `/metrics` 追加 ADR-100 SLO emitter,不新增 DB schema、不改 Prometheus scrape target,讓既有 `awoooi-api` scrape job 可直接取得底層 series。
- `GovernanceAgent` 的 SLO no-data hint 改成 emitter / recording rule / multiprocess mount 三段式,不再把已驗證存在的 `PROMETHEUS_MULTIPROC_DIR` 當成單一原因。
- 清理 `main.py` 兩個既有未使用 import(`aiops_flags`、`_dt`),避免本次觸碰檔案繼續帶 F401 技術債。
**本地驗證**
- `python3 -m py_compile apps/api/src/services/adr100_slo_metrics_service.py apps/api/src/services/governance_agent.py apps/api/src/main.py apps/api/tests/test_adr100_slo_metrics_service.py`pass。
- `pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`47 passed。
- `ruff check --select F,E9 src/services/adr100_slo_metrics_service.py src/services/governance_agent.py src/main.py tests/test_adr100_slo_metrics_service.py`pass。
- `git diff --check`pass。
- Production SQL dry-runautomation / verification / knowledge / high-confidence approval 查詢均可在現有 schema 上執行。
**目前整體進度**
- Alertmanager 低風險自動修復主線:約 96%。
- 完整 AI 自動化管理產品化:約 78%。
- T18 正在推版;推版後需等 Prometheus scrape / recording rule evaluation再確認 `sli:*` 不再全空,並觀察 `governance_slo_data_gap` 是否停止重複推播。
## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復,解除前端工作鏈路紅燈
**背景**T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞,但 API 層本身仍有兩個紅燈:`GET /api/v1/ai/governance/events?...` 回 500`GET /api/v1/ai/governance/queue?dispatch_status=pending``table_pending=true`。統帥要求前端要能呈現完整流程,不能讓治理告警與 dispatch 階段停在 API 黑盒。

View File

@@ -158,6 +158,8 @@ increase(knowledge_entries_total[24h])
| `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 |
| `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard |
| `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 |
| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series |
| `apps/api/src/main.py` `/metrics` | 2026-05-14 T18追加 DB-derived SLO emitter讓既有 `awoooi-api` scrape job 取得底層 series |
## 決策理由

View File

@@ -2125,6 +2125,13 @@ Phase 6 完成後
- Production deploy`08d28dc4` 與 enum cast hotfix `6220f522` 已推 Gitea mainCode Review runs `2151` / `2153` successCD runs `2150` / `2152` success最新 deploy marker `9b32d3a9 chore(cd): deploy 6220f52 [skip ci]`API / Worker / Web image 均為 `6220f5226693330a378f363202bd79065ab7fc34``governance/events` 200、`governance/queue` 200 且 `table_pending=false``/zh-TW/awooop/work-items` 200。
- 目前進度更新Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 76%。下一段收斂 governance dispatcher skipped reason / leader-dedupe / ADR-100 SLO emitter並把治理 dispatch 階段完整呈現在 Operator Console。
**T18 ADR-100 SLO emitter 接入2026-05-14 台北)**
- 觸發:治理告警 `governance_slo_data_gap` 反覆推 Telegram,但 production 查核顯示 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir`,真正缺口是 `/metrics` 未輸出 ADR-100 recording rules 所需底層 series,導致 `sli:*` 全部 empty result。
- 修正:新增 DB-derived `/metrics` emitter,從 `automation_operation_log`、`incident_evidence`、`knowledge_entries`、`approval_records` 暴露 `automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`;不新增 schema、不改 scrape target。
- 訊息治理:`GovernanceAgent` no-data hint 改為 emitter / recording rule / multiprocess mount 三段式,避免 Operator 被誤導成只有 `PROMETHEUS_MULTIPROC_DIR` 未設。
- 驗證:`py_compile` pass`pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q` 47 passedruff F/E9 passdiff check passproduction SQL dry-run 通過。
- 目前進度更新Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 78%。推版後需等 Prometheus scrape / recording rule evaluation再確認 `sli:*` 不再全空。
---
### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護commit de2d34d