feat(governance): emit adr100 slo metrics
This commit is contained in:
@@ -76,13 +76,13 @@ from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
|
||||
from src.api.v1 import timeline as timeline_v1
|
||||
from src.api.v1 import webhooks as webhooks_v1
|
||||
from src.core.config import settings
|
||||
from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 feature flags 啟動驗證
|
||||
from src.core.http_client import close_all_http_clients, init_all_http_clients
|
||||
from src.core.logging import get_logger, setup_logging
|
||||
from src.core.redis_client import close_redis_pool, init_redis_pool
|
||||
from src.services.flywheel_stats_service import get_flywheel_stats_service
|
||||
from src.core.sse import get_publisher
|
||||
from src.core.telemetry import setup_telemetry, shutdown_telemetry
|
||||
from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service
|
||||
from src.services.flywheel_stats_service import get_flywheel_stats_service
|
||||
|
||||
# CTO-201: Database & Executor
|
||||
from src.db.base import close_db, init_db
|
||||
@@ -554,7 +554,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
# 2026-04-27 P3.1-T3 by Claude
|
||||
try:
|
||||
from src.utils.timezone import now_taipei
|
||||
from datetime import datetime as _dt
|
||||
|
||||
async def _run_kb_rot_cleaner_loop() -> None:
|
||||
from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner
|
||||
@@ -1016,6 +1015,13 @@ async def prometheus_metrics() -> Response:
|
||||
content += flywheel_metrics.to_prometheus_lines()
|
||||
except Exception:
|
||||
logger.warning("prometheus_metrics_flywheel_error")
|
||||
# 2026-05-14 Codex — T18 ADR-100 SLO emitter
|
||||
# GovernanceAgent 讀 Prometheus recording rules;若 /metrics 不吐底層 DB totals,
|
||||
# sli:* rules 會全空並每小時重複發 governance_slo_data_gap。
|
||||
try:
|
||||
content += await get_adr100_slo_metrics_service().to_prometheus_lines()
|
||||
except Exception as exc:
|
||||
logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc))
|
||||
return Response(content=content, media_type=CONTENT_TYPE_LATEST)
|
||||
|
||||
|
||||
|
||||
217
apps/api/src/services/adr100_slo_metrics_service.py
Normal file
217
apps/api/src/services/adr100_slo_metrics_service.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
ADR-100 SLO metrics emitter.
|
||||
|
||||
Prometheus recording rules for the AI flywheel SLOs expect a small set of
|
||||
counter-like metrics. The source of truth already lives in PostgreSQL, so this
|
||||
read-side emitter exposes DB totals on /metrics without changing runtime write
|
||||
paths or introducing another state store.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from time import time
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from src.db.base import get_db_context
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AutomationOperationSample:
    """One automation_operation_log_total series: a count per (outcome, operation_type)."""

    # Value of the `outcome` label.
    outcome: str
    # Value of the `operation_type` label.
    operation_type: str
    # Sample value emitted for this label combination.
    count: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VerificationSample:
    """One post_execution_verification_total series: a count per outcome."""

    # Value of the `outcome` label.
    outcome: str
    # Sample value emitted for this outcome.
    count: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Adr100SloMetricsSnapshot:
    """Immutable bundle of every value needed to render the ADR-100 SLO metrics.

    All fields have defaults, so an empty snapshot can be built with no
    arguments; list fields get a fresh list per instance.
    """

    # Per-(outcome, operation_type) automation operation counts.
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
    # Per-outcome post-execution verification counts.
    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
    # Total number of knowledge entries.
    knowledge_entries_total: int = 0
    # Number of high-confidence approval decisions.
    high_confidence_total: int = 0
    # High-confidence approval decisions that also have a successful verification.
    high_confidence_success_total: int = 0
    # Timestamp from time.time(), taken when the snapshot object is created
    # unless the caller supplies one.
    emitted_at: float = field(default_factory=time)
|
||||
|
||||
|
||||
class Adr100SloMetricsService:
    """Read-side emitter that turns database totals into ADR-100 Prometheus samples."""

    async def to_prometheus_lines(self) -> str:
        """Fetch a fresh snapshot and return it as Prometheus exposition text."""
        return render_adr100_slo_metrics(await self.fetch_snapshot())

    async def fetch_snapshot(self) -> Adr100SloMetricsSnapshot:
        """Run the four ADR-100 queries and pack the results into a snapshot.

        The queries are issued one after another on the same context obtained
        from ``get_db_context()``: automation operations, post-execution
        verifications, the knowledge entry count, and the high-confidence
        approval aggregates. NULL counts are coerced to 0.

        Returns:
            A snapshot whose ``emitted_at`` is left at its default.
        """
        async with get_db_context() as session:
            operation_result = await session.execute(text(_AUTOMATION_OPERATION_SQL))
            operation_rows = operation_result.fetchall()

            verification_result = await session.execute(
                text(_POST_EXECUTION_VERIFICATION_SQL)
            )
            verification_rows = verification_result.fetchall()

            knowledge_result = await session.execute(
                text("SELECT count(*) FROM knowledge_entries")
            )
            knowledge_count = int(knowledge_result.scalar() or 0)

            approval_result = await session.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
            # The approval query aggregates without GROUP BY, so exactly one
            # row is expected here.
            approval_row = approval_result.one()

        operations = []
        for row in operation_rows:
            operations.append(
                AutomationOperationSample(
                    outcome=str(row.outcome),
                    operation_type=str(row.operation_type),
                    count=int(row.count or 0),
                )
            )

        verifications = []
        for row in verification_rows:
            verifications.append(
                VerificationSample(
                    outcome=str(row.outcome),
                    count=int(row.count or 0),
                )
            )

        high_confidence = int(approval_row.high_confidence_total or 0)
        high_confidence_success = int(approval_row.high_confidence_success_total or 0)

        return Adr100SloMetricsSnapshot(
            automation_operations=operations,
            post_execution_verifications=verifications,
            knowledge_entries_total=knowledge_count,
            high_confidence_total=high_confidence,
            high_confidence_success_total=high_confidence_success,
        )
|
||||
|
||||
|
||||
def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
    """Serialize a snapshot as Prometheus text exposition lines.

    The output begins with an empty line (so it can be appended to existing
    exposition text) and ends with a trailing newline. When a labelled metric
    has no samples, a single zero-valued series with ``"none"`` labels is
    emitted instead, so the metric name is always present.

    Args:
        snapshot: The values to render.

    Returns:
        The newline-joined exposition text.
    """
    operations = snapshot.automation_operations
    if operations:
        operation_series = [
            "automation_operation_log_total"
            f'{{outcome="{_escape_label(op.outcome)}",'
            f'operation_type="{_escape_label(op.operation_type)}"}} '
            f"{op.count}"
            for op in operations
        ]
    else:
        operation_series = [
            'automation_operation_log_total{outcome="none",operation_type="none"} 0'
        ]

    verifications = snapshot.post_execution_verifications
    if verifications:
        verification_series = [
            "post_execution_verification_total"
            f'{{outcome="{_escape_label(check.outcome)}"}} {check.count}'
            for check in verifications
        ]
    else:
        verification_series = ['post_execution_verification_total{outcome="none"} 0']

    document = [
        # Leading empty element yields the separating newline before our block.
        "",
        "# HELP automation_operation_log_total DB-derived AI automation operation count for ADR-100 SLOs",
        "# TYPE automation_operation_log_total counter",
        *operation_series,
        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
        "# TYPE post_execution_verification_total counter",
        *verification_series,
        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
        "# TYPE knowledge_entries_total counter",
        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_total counter",
        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
        "# HELP approval_records_high_confidence_success_total DB-derived high confidence approval decisions with successful verification for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_success_total counter",
        f"approval_records_high_confidence_success_total {snapshot.high_confidence_success_total}",
        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
        # Trailing empty element yields the final newline.
        "",
    ]
    return "\n".join(document)
|
||||
|
||||
|
||||
def _escape_label(value: str) -> str:
|
||||
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
|
||||
|
||||
|
||||
# Row counts from automation_operation_log, grouped by a derived `outcome`
# and by operation_type. The derived outcome is:
#   - the row's own status when status <> 'success';
#   - 'human_required' when actor = 'approval_execution' and
#     input->>'requested_by' (coalesced to '' when missing) does not start
#     with "auto" (ILIKE, so case-insensitive);
#   - 'auto_executed' otherwise.
# NOTE(review): the LIKE pattern is written 'auto%%'. Two consecutive %
# wildcards match the same strings as one, so this is harmless if the text
# reaches the server unchanged — confirm the doubling is intended for the
# driver in use.
# NOTE(review): a NULL status makes `status <> 'success'` NULL, so such rows
# would fall through to the later branches — confirm status is NOT NULL.
_AUTOMATION_OPERATION_SQL = """
|
||||
SELECT
|
||||
CASE
|
||||
WHEN status <> 'success' THEN status
|
||||
WHEN actor = 'approval_execution'
|
||||
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
|
||||
THEN 'human_required'
|
||||
ELSE 'auto_executed'
|
||||
END AS outcome,
|
||||
operation_type,
|
||||
count(*) AS count
|
||||
FROM automation_operation_log
|
||||
GROUP BY outcome, operation_type
|
||||
ORDER BY outcome, operation_type
|
||||
"""
|
||||
|
||||
|
||||
# Row counts from incident_evidence grouped by verification_result (exposed
# as `outcome`); rows whose verification_result is NULL are excluded.
_POST_EXECUTION_VERIFICATION_SQL = """
|
||||
SELECT verification_result AS outcome, count(*) AS count
|
||||
FROM incident_evidence
|
||||
WHERE verification_result IS NOT NULL
|
||||
GROUP BY verification_result
|
||||
ORDER BY verification_result
|
||||
"""
|
||||
|
||||
|
||||
# Counts approval_records rows whose confidence is >= 0.8
# (high_confidence_total) and the subset that also has at least one
# incident_evidence row for the same incident_id with
# verification_result = 'success' (high_confidence_success_total).
# Confidence is the first non-NULL of:
#   1. extra_metadata->>'confidence_score', only if it matches the
#      unsigned-decimal regex;
#   2. extra_metadata->>'confidence', under the same regex check;
#   3. composite_score;
#   4. the literal 0.
# The outer SELECT aggregates without GROUP BY, so it yields a single row.
# The backslash in the regex is doubled because this is a non-raw Python
# string; the SQL text contains `\.`.
_HIGH_CONFIDENCE_APPROVAL_SQL = """
|
||||
WITH approval_confidence AS (
|
||||
SELECT
|
||||
id,
|
||||
incident_id,
|
||||
COALESCE(
|
||||
CASE
|
||||
WHEN extra_metadata->>'confidence_score' ~ '^[0-9]+(\\.[0-9]+)?$'
|
||||
THEN (extra_metadata->>'confidence_score')::numeric
|
||||
ELSE NULL
|
||||
END,
|
||||
CASE
|
||||
WHEN extra_metadata->>'confidence' ~ '^[0-9]+(\\.[0-9]+)?$'
|
||||
THEN (extra_metadata->>'confidence')::numeric
|
||||
ELSE NULL
|
||||
END,
|
||||
composite_score,
|
||||
0
|
||||
) AS confidence
|
||||
FROM approval_records
|
||||
)
|
||||
SELECT
|
||||
count(*) FILTER (WHERE confidence >= 0.8) AS high_confidence_total,
|
||||
count(*) FILTER (
|
||||
WHERE confidence >= 0.8
|
||||
AND EXISTS (
|
||||
SELECT 1
|
||||
FROM incident_evidence ev
|
||||
WHERE ev.incident_id = approval_confidence.incident_id
|
||||
AND ev.verification_result = 'success'
|
||||
)
|
||||
) AS high_confidence_success_total
|
||||
FROM approval_confidence
|
||||
"""
|
||||
|
||||
|
||||
# Module-level cache for the service instance; filled on first request.
_adr100_slo_metrics_service: Adr100SloMetricsService | None = None


def get_adr100_slo_metrics_service() -> Adr100SloMetricsService:
    """Return the shared Adr100SloMetricsService, creating it on first use."""
    global _adr100_slo_metrics_service
    if _adr100_slo_metrics_service is not None:
        return _adr100_slo_metrics_service
    _adr100_slo_metrics_service = Adr100SloMetricsService()
    return _adr100_slo_metrics_service
|
||||
@@ -447,13 +447,13 @@ class GovernanceAgent:
|
||||
"status": "skipped",
|
||||
"error": "no_data",
|
||||
"reason": "prometheus_empty_result_metric_not_emitted",
|
||||
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
|
||||
"hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
|
||||
}
|
||||
logger.warning(
|
||||
"governance_slo_no_data",
|
||||
slo=name,
|
||||
query=query,
|
||||
hint="ADR-100 emitter not yet implemented",
|
||||
hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
|
||||
)
|
||||
continue
|
||||
value = float(result_list[0]["value"][1])
|
||||
@@ -655,15 +655,15 @@ class GovernanceAgent:
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / km_entries_total)",
|
||||
"設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir)",
|
||||
"補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / knowledge_entries_total)",
|
||||
"確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
|
||||
],
|
||||
"next_action": "run_adr100_slo_emit_playbook",
|
||||
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
|
||||
"hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
|
||||
"先確認 /metrics 是否已輸出 ADR-100 底層指標",
|
||||
"檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
|
||||
],
|
||||
},
|
||||
|
||||
71
apps/api/tests/test_adr100_slo_metrics_service.py
Normal file
71
apps/api/tests/test_adr100_slo_metrics_service.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from src.services.adr100_slo_metrics_service import (
|
||||
Adr100SloMetricsSnapshot,
|
||||
AutomationOperationSample,
|
||||
VerificationSample,
|
||||
render_adr100_slo_metrics,
|
||||
)
|
||||
|
||||
|
||||
def test_render_adr100_slo_metrics_outputs_required_series() -> None:
    """A populated snapshot renders every series the SLO rules depend on."""
    operations = [
        AutomationOperationSample(
            outcome="auto_executed",
            operation_type="playbook_executed",
            count=8,
        ),
        AutomationOperationSample(
            outcome="human_required",
            operation_type="playbook_executed",
            count=2,
        ),
    ]
    verifications = [
        VerificationSample(outcome="success", count=7),
        VerificationSample(outcome="failed", count=1),
    ]

    output = render_adr100_slo_metrics(
        Adr100SloMetricsSnapshot(
            automation_operations=operations,
            post_execution_verifications=verifications,
            knowledge_entries_total=2161,
            high_confidence_total=9,
            high_confidence_success_total=7,
            emitted_at=1_778_756_000,
        )
    )

    expected_fragments = (
        'automation_operation_log_total{outcome="auto_executed",'
        'operation_type="playbook_executed"} 8',
        'post_execution_verification_total{outcome="success"} 7',
        "knowledge_entries_total 2161",
        "approval_records_high_confidence_total 9",
        "approval_records_high_confidence_success_total 7",
        "adr100_slo_emitter_last_success_timestamp 1778756000",
    )
    for fragment in expected_fragments:
        assert fragment in output
|
||||
|
||||
|
||||
def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
    """An empty snapshot still renders zero-valued placeholder series."""
    empty_snapshot = Adr100SloMetricsSnapshot(emitted_at=1_778_756_000)

    output = render_adr100_slo_metrics(empty_snapshot)

    placeholders = (
        'automation_operation_log_total{outcome="none",operation_type="none"} 0',
        'post_execution_verification_total{outcome="none"} 0',
        "knowledge_entries_total 0",
    )
    for placeholder in placeholders:
        assert placeholder in output
|
||||
|
||||
|
||||
def test_render_adr100_slo_metrics_escapes_labels() -> None:
    """Quotes and newlines inside label values are backslash-escaped."""
    tricky_sample = AutomationOperationSample(
        outcome='auto"executed',
        operation_type="line\nbreak",
        count=1,
    )
    snapshot = Adr100SloMetricsSnapshot(
        automation_operations=[tricky_sample],
        emitted_at=1_778_756_000,
    )

    output = render_adr100_slo_metrics(snapshot)

    for escaped_label in (
        'outcome="auto\\"executed"',
        'operation_type="line\\nbreak"',
    ):
        assert escaped_label in output
|
||||
@@ -1,3 +1,25 @@
|
||||
## 2026-05-14 | T18 ADR-100 SLO emitter 接入,治理資料缺口告警轉為可驗證指標
|
||||
|
||||
**背景**:Telegram 反覆出現「AI 治理警報|SLO 資料缺口」,但訊息只能說 `all_slo_metrics_not_emitted`,無法讓 Operator 判斷是 Pod 掛載、Prometheus rule、還是 emitter 本身缺失。Production 查核確認 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir` 掛載,真正缺口是 `/metrics` 沒有輸出 ADR-100 recording rules 所需的底層 series。
|
||||
|
||||
**修正**:
|
||||
- 新增 `adr100_slo_metrics_service.py`,從 PostgreSQL 事實來源產出 DB-derived Prometheus 指標:`automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`。
|
||||
- `/metrics` 追加 ADR-100 SLO emitter,不新增 DB schema、不改 Prometheus scrape target,讓既有 `awoooi-api` scrape job 可直接取得底層 series。
|
||||
- `GovernanceAgent` 的 SLO no-data hint 改成 emitter / recording rule / multiprocess mount 三段式,不再把已驗證存在的 `PROMETHEUS_MULTIPROC_DIR` 當成單一原因。
|
||||
- 清理 `main.py` 兩個既有未使用 import(`aiops_flags`、`_dt`),避免本次觸碰檔案繼續帶 F401 技術債。
|
||||
|
||||
**本地驗證**:
|
||||
- `python3 -m py_compile apps/api/src/services/adr100_slo_metrics_service.py apps/api/src/services/governance_agent.py apps/api/src/main.py apps/api/tests/test_adr100_slo_metrics_service.py`:pass。
|
||||
- `pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`:47 passed。
|
||||
- `ruff check --select F,E9 src/services/adr100_slo_metrics_service.py src/services/governance_agent.py src/main.py tests/test_adr100_slo_metrics_service.py`:pass。
|
||||
- `git diff --check`:pass。
|
||||
- Production SQL dry-run:automation / verification / knowledge / high-confidence approval 查詢均可在現有 schema 上執行。
|
||||
|
||||
**目前整體進度**:
|
||||
- Alertmanager 低風險自動修復主線:約 96%。
|
||||
- 完整 AI 自動化管理產品化:約 78%。
|
||||
- T18 正在推版;推版後需等 Prometheus scrape / recording rule evaluation,再確認 `sli:*` 不再全空,並觀察 `governance_slo_data_gap` 是否停止重複推播。
|
||||
|
||||
## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復,解除前端工作鏈路紅燈
|
||||
|
||||
**背景**:T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞,但 API 層本身仍有兩個紅燈:`GET /api/v1/ai/governance/events?...` 回 500,`GET /api/v1/ai/governance/queue?dispatch_status=pending` 回 `table_pending=true`。統帥要求前端要能呈現完整流程,不能讓治理告警與 dispatch 階段停在 API 黑盒。
|
||||
|
||||
@@ -158,6 +158,8 @@ increase(knowledge_entries_total[24h])
|
||||
| `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 |
|
||||
| `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard |
|
||||
| `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 |
|
||||
| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series |
|
||||
| `apps/api/src/main.py` `/metrics` | 2026-05-14 T18:追加 DB-derived SLO emitter,讓既有 `awoooi-api` scrape job 取得底層 series |
|
||||
|
||||
## 決策理由
|
||||
|
||||
|
||||
@@ -2125,6 +2125,13 @@ Phase 6 完成後
|
||||
- Production deploy:`08d28dc4` 與 enum cast hotfix `6220f522` 已推 Gitea main;Code Review runs `2151` / `2153` success;CD runs `2150` / `2152` success;最新 deploy marker `9b32d3a9 chore(cd): deploy 6220f52 [skip ci]`;API / Worker / Web image 均為 `6220f5226693330a378f363202bd79065ab7fc34`;`governance/events` 200、`governance/queue` 200 且 `table_pending=false`;`/zh-TW/awooop/work-items` 200。
|
||||
- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 76%。下一段收斂 governance dispatcher skipped reason / leader-dedupe / ADR-100 SLO emitter,並把治理 dispatch 階段完整呈現在 Operator Console。
|
||||
|
||||
**T18 ADR-100 SLO emitter 接入(2026-05-14 台北)**:
|
||||
- 觸發:治理告警 `governance_slo_data_gap` 反覆推 Telegram,但 production 查核顯示 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir`,真正缺口是 `/metrics` 未輸出 ADR-100 recording rules 所需底層 series,導致 `sli:*` 全部 empty result。
|
||||
- 修正:新增 DB-derived `/metrics` emitter,從 `automation_operation_log`、`incident_evidence`、`knowledge_entries`、`approval_records` 暴露 `automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`;不新增 schema、不改 scrape target。
|
||||
- 訊息治理:`GovernanceAgent` no-data hint 改為 emitter / recording rule / multiprocess mount 三段式,避免 Operator 被誤導成只有 `PROMETHEUS_MULTIPROC_DIR` 未設。
|
||||
- 驗證:`py_compile` pass;`pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q` 47 passed;ruff F/E9 pass;diff check pass;production SQL dry-run 通過。
|
||||
- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 78%。推版後需等 Prometheus scrape / recording rule evaluation,再確認 `sli:*` 不再全空。
|
||||
|
||||
---
|
||||
|
||||
### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d)
|
||||
|
||||
Reference in New Issue
Block a user