diff --git a/apps/api/src/main.py b/apps/api/src/main.py index d20bb752..02210dc3 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -76,13 +76,13 @@ from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE from src.api.v1 import timeline as timeline_v1 from src.api.v1 import webhooks as webhooks_v1 from src.core.config import settings -from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 feature flags 啟動驗證 from src.core.http_client import close_all_http_clients, init_all_http_clients from src.core.logging import get_logger, setup_logging from src.core.redis_client import close_redis_pool, init_redis_pool -from src.services.flywheel_stats_service import get_flywheel_stats_service from src.core.sse import get_publisher from src.core.telemetry import setup_telemetry, shutdown_telemetry +from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service +from src.services.flywheel_stats_service import get_flywheel_stats_service # CTO-201: Database & Executor from src.db.base import close_db, init_db @@ -554,7 +554,6 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: # 2026-04-27 P3.1-T3 by Claude try: from src.utils.timezone import now_taipei - from datetime import datetime as _dt async def _run_kb_rot_cleaner_loop() -> None: from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner @@ -1016,6 +1015,13 @@ async def prometheus_metrics() -> Response: content += flywheel_metrics.to_prometheus_lines() except Exception: logger.warning("prometheus_metrics_flywheel_error") + # 2026-05-14 Codex — T18 ADR-100 SLO emitter + # GovernanceAgent 讀 Prometheus recording rules;若 /metrics 不吐底層 DB totals, + # sli:* rules 會全空並每小時重複發 governance_slo_data_gap。 + try: + content += await get_adr100_slo_metrics_service().to_prometheus_lines() + except Exception as exc: + logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc)) return Response(content=content, media_type=CONTENT_TYPE_LATEST) diff --git 
a/apps/api/src/services/adr100_slo_metrics_service.py b/apps/api/src/services/adr100_slo_metrics_service.py
new file mode 100644
index 00000000..c0dbef9c
--- /dev/null
+++ b/apps/api/src/services/adr100_slo_metrics_service.py
@@ -0,0 +1,262 @@
+"""
+ADR-100 SLO metrics emitter.
+
+Prometheus recording rules for the AI flywheel SLOs expect a small set of
+counter-like metrics. The source of truth already lives in PostgreSQL, so this
+read-side emitter exposes DB totals on /metrics without changing runtime write
+paths or introducing another state store.
+
+NOTE(review): every series emitted here is a live row count rather than a
+monotonic counter. If rows are ever deleted from the backing tables the value
+drops, and Prometheus functions such as ``increase()`` treat a drop as a
+counter reset. Confirm the recording rules tolerate that.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from time import time
+
+from sqlalchemy import text
+
+from src.db.base import get_db_context
+
+
+@dataclass(frozen=True)
+class AutomationOperationSample:
+    """Row count behind one ``automation_operation_log_total`` series."""
+
+    outcome: str
+    operation_type: str
+    count: int
+
+
+@dataclass(frozen=True)
+class VerificationSample:
+    """Row count behind one ``post_execution_verification_total`` series."""
+
+    outcome: str
+    count: int
+
+
+@dataclass(frozen=True)
+class Adr100SloMetricsSnapshot:
+    """All values rendered by one ADR-100 emission.
+
+    ``emitted_at`` is a Unix timestamp in seconds and defaults to the moment the
+    snapshot object is created.
+    """
+
+    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
+    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
+    knowledge_entries_total: int = 0
+    high_confidence_total: int = 0
+    high_confidence_success_total: int = 0
+    emitted_at: float = field(default_factory=time)
+
+
+class Adr100SloMetricsService:
+    """Build ADR-100 Prometheus samples from production DB state."""
+
+    async def to_prometheus_lines(self) -> str:
+        """Query the database and return the samples as exposition text."""
+        snapshot = await self.fetch_snapshot()
+        return render_adr100_slo_metrics(snapshot)
+
+    async def fetch_snapshot(self) -> Adr100SloMetricsSnapshot:
+        """Run the four aggregate queries and collect them into a snapshot.
+
+        All queries share one ``get_db_context()`` session and nothing is
+        cached, so every call reads the database. NULL aggregates become 0.
+        """
+        async with get_db_context() as db:
+            automation_rows = (
+                await db.execute(text(_AUTOMATION_OPERATION_SQL))
+            ).fetchall()
+            verification_rows = (
+                await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
+            ).fetchall()
+            knowledge_total = int(
+                (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
+                or 0
+            )
+            # The query aggregates without GROUP BY, so it yields exactly one row.
+            confidence_row = (
+                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
+            ).one()
+
+        return Adr100SloMetricsSnapshot(
+            automation_operations=[
+                AutomationOperationSample(
+                    outcome=str(row.outcome),
+                    operation_type=str(row.operation_type),
+                    count=int(row.count or 0),
+                )
+                for row in automation_rows
+            ],
+            post_execution_verifications=[
+                VerificationSample(
+                    outcome=str(row.outcome),
+                    count=int(row.count or 0),
+                )
+                for row in verification_rows
+            ],
+            knowledge_entries_total=knowledge_total,
+            high_confidence_total=int(confidence_row.high_confidence_total or 0),
+            high_confidence_success_total=int(
+                confidence_row.high_confidence_success_total or 0
+            ),
+        )
+
+
+def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
+    """Render ADR-100 SLO metrics in Prometheus text exposition format.
+
+    The text starts and ends with a newline so it can be appended to other
+    exposition output. When a sample list is empty, a single zero-valued series
+    labelled ``"none"`` is written in its place so the metric family is still
+    present.
+    """
+    lines: list[str] = [
+        "",
+        "# HELP automation_operation_log_total DB-derived AI automation operation count for ADR-100 SLOs",
+        "# TYPE automation_operation_log_total counter",
+    ]
+    if snapshot.automation_operations:
+        for sample in snapshot.automation_operations:
+            lines.append(
+                "automation_operation_log_total"
+                f'{{outcome="{_escape_label(sample.outcome)}",'
+                f'operation_type="{_escape_label(sample.operation_type)}"}} '
+                f"{sample.count}"
+            )
+    else:
+        lines.append(
+            'automation_operation_log_total{outcome="none",operation_type="none"} 0'
+        )
+
+    lines.extend([
+        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
+        "# TYPE post_execution_verification_total counter",
+    ])
+    if snapshot.post_execution_verifications:
+        for sample in snapshot.post_execution_verifications:
+            lines.append(
+                "post_execution_verification_total"
+                f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
+            )
+    else:
+        lines.append('post_execution_verification_total{outcome="none"} 0')
+
+    lines.extend([
+        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
+        "# TYPE knowledge_entries_total counter",
+        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
+        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
+        "# TYPE approval_records_high_confidence_total counter",
+        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
+        "# HELP approval_records_high_confidence_success_total DB-derived high confidence approval decisions with successful verification for ADR-100 SLOs",
+        "# TYPE approval_records_high_confidence_success_total counter",
+        (
+            "approval_records_high_confidence_success_total "
+            f"{snapshot.high_confidence_success_total}"
+        ),
+        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
+        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
+        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
+        "",
+    ])
+    return "\n".join(lines)
+
+
+def _escape_label(value: str) -> str:
+    """Escape backslash, newline and double quote for use in a label value."""
+    return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
+
+
+# Classifies every automation_operation_log row into an outcome label:
+#   * a status other than 'success' is reported as-is, and a NULL status as
+#     'unknown' (a plain ``status <> 'success'`` is NULL for NULL input and
+#     would let such rows fall through to the success branches);
+#   * a successful row whose actor is 'approval_execution' and whose
+#     input.requested_by does not start with "auto" is 'human_required';
+#   * any other successful row is 'auto_executed'.
+# GROUP BY / ORDER BY use ordinals so they always refer to the computed output
+# columns; in GROUP BY an ambiguous bare name resolves to an input column first.
+_AUTOMATION_OPERATION_SQL = """
+    SELECT
+        CASE
+            WHEN status IS DISTINCT FROM 'success' THEN COALESCE(status, 'unknown')
+            WHEN actor = 'approval_execution'
+              AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
+            THEN 'human_required'
+            ELSE 'auto_executed'
+        END AS outcome,
+        COALESCE(operation_type::text, 'unknown') AS operation_type,
+        count(*) AS count
+    FROM automation_operation_log
+    GROUP BY 1, 2
+    ORDER BY 1, 2
+"""
+
+
+# Counts incident_evidence rows per non-NULL verification_result.
+_POST_EXECUTION_VERIFICATION_SQL = """
+    SELECT verification_result AS outcome, count(*) AS count
+    FROM incident_evidence
+    WHERE verification_result IS NOT NULL
+    GROUP BY verification_result
+    ORDER BY verification_result
+"""
+
+
+# Confidence per approval record is the first usable value among
+# extra_metadata.confidence_score, extra_metadata.confidence (both only when
+# they look like a plain non-negative decimal), composite_score, and 0.
+# A record is "high confidence" at >= 0.8; it also counts as a success when its
+# incident has at least one incident_evidence row verified as 'success'.
+_HIGH_CONFIDENCE_APPROVAL_SQL = """
+    WITH approval_confidence AS (
+        SELECT
+            id,
+            incident_id,
+            COALESCE(
+                CASE
+                    WHEN extra_metadata->>'confidence_score' ~ '^[0-9]+(\\.[0-9]+)?$'
+                    THEN (extra_metadata->>'confidence_score')::numeric
+                    ELSE NULL
+                END,
+                CASE
+                    WHEN extra_metadata->>'confidence' ~ '^[0-9]+(\\.[0-9]+)?$'
+                    THEN (extra_metadata->>'confidence')::numeric
+                    ELSE NULL
+                END,
+                composite_score,
+                0
+            ) AS confidence
+        FROM approval_records
+    )
+    SELECT
+        count(*) FILTER (WHERE confidence >= 0.8) AS high_confidence_total,
+        count(*) FILTER (
+            WHERE confidence >= 0.8
+              AND EXISTS (
+                  SELECT 1
+                  FROM incident_evidence ev
+                  WHERE ev.incident_id = approval_confidence.incident_id
+                    AND ev.verification_result = 'success'
+              )
+        ) AS high_confidence_success_total
+    FROM approval_confidence
+"""
+
+
+_adr100_slo_metrics_service: Adr100SloMetricsService | None = None
+
+
+def get_adr100_slo_metrics_service() -> Adr100SloMetricsService:
+    """Return the module-wide service instance, creating it on first use."""
+    global _adr100_slo_metrics_service
+    if _adr100_slo_metrics_service is None:
+        _adr100_slo_metrics_service = Adr100SloMetricsService()
+    return _adr100_slo_metrics_service
diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py index 5241afc0..24ea676e 100644 --- a/apps/api/src/services/governance_agent.py +++ b/apps/api/src/services/governance_agent.py @@ -447,13 +447,13 @@ class GovernanceAgent: "status": "skipped", "error": "no_data", "reason": "prometheus_empty_result_metric_not_emitted", - "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設", + "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載", } logger.warning( "governance_slo_no_data", slo=name, query=query, - hint="ADR-100 emitter not yet implemented", + hint="ADR-100 metrics, recording rules, or multiprocess mount not ready", ) continue value = float(result_list[0]["value"][1]) @@ -655,15 +655,15 @@ class GovernanceAgent: }, "remediation": { "items": [ - "補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / km_entries_total)", - "設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir)", + "補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / knowledge_entries_total)", + "確認 Prometheus 
recording rules 已載入,且 API Pod multiprocess 目錄可寫", ], "next_action": "run_adr100_slo_emit_playbook", - "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設", + "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒", }, "actionable": { "items": [ - "先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載", + "先確認 /metrics 是否已輸出 ADR-100 底層指標", "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則", ], },
diff --git a/apps/api/tests/test_adr100_slo_metrics_service.py b/apps/api/tests/test_adr100_slo_metrics_service.py
new file mode 100644
index 00000000..5fa970cf
--- /dev/null
+++ b/apps/api/tests/test_adr100_slo_metrics_service.py
@@ -0,0 +1,80 @@
+from src.services.adr100_slo_metrics_service import (
+    Adr100SloMetricsSnapshot,
+    AutomationOperationSample,
+    VerificationSample,
+    render_adr100_slo_metrics,
+)
+
+
+def test_render_adr100_slo_metrics_outputs_required_series() -> None:
+    """A populated snapshot renders every series with its value."""
+    snapshot = Adr100SloMetricsSnapshot(
+        automation_operations=[
+            AutomationOperationSample(
+                outcome="auto_executed",
+                operation_type="playbook_executed",
+                count=8,
+            ),
+            AutomationOperationSample(
+                outcome="human_required",
+                operation_type="playbook_executed",
+                count=2,
+            ),
+        ],
+        post_execution_verifications=[
+            VerificationSample(outcome="success", count=7),
+            VerificationSample(outcome="failed", count=1),
+        ],
+        knowledge_entries_total=2161,
+        high_confidence_total=9,
+        high_confidence_success_total=7,
+        emitted_at=1_778_756_000,
+    )
+
+    rendered = render_adr100_slo_metrics(snapshot)
+
+    assert (
+        'automation_operation_log_total{outcome="auto_executed",'
+        'operation_type="playbook_executed"} 8'
+    ) in rendered
+    assert 'post_execution_verification_total{outcome="success"} 7' in rendered
+    assert "knowledge_entries_total 2161" in rendered
+    assert "approval_records_high_confidence_total 9" in rendered
+    assert "approval_records_high_confidence_success_total 7" in rendered
+    assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered
+
+
+def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
+    """An empty snapshot renders placeholder zero samples."""
+    rendered = render_adr100_slo_metrics(
+        Adr100SloMetricsSnapshot(emitted_at=1_778_756_000),
+    )
+
+    assert 'automation_operation_log_total{outcome="none",operation_type="none"} 0' in rendered
+    assert 'post_execution_verification_total{outcome="none"} 0' in rendered
+    assert "knowledge_entries_total 0" in rendered
+
+
+def test_render_adr100_slo_metrics_escapes_labels() -> None:
+    """Double quotes, newlines and backslashes in label values are escaped."""
+    rendered = render_adr100_slo_metrics(
+        Adr100SloMetricsSnapshot(
+            automation_operations=[
+                AutomationOperationSample(
+                    outcome='auto"executed',
+                    operation_type="line\nbreak",
+                    count=1,
+                ),
+                AutomationOperationSample(
+                    outcome="back\\slash",
+                    operation_type="plain",
+                    count=2,
+                ),
+            ],
+            emitted_at=1_778_756_000,
+        ),
+    )
+
+    assert 'outcome="auto\\"executed"' in rendered
+    assert 'operation_type="line\\nbreak"' in rendered
+    assert 'outcome="back\\\\slash"' in rendered
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e206ea07..1b1860a5 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,25 @@ +## 2026-05-14 | T18 ADR-100 SLO emitter 接入,治理資料缺口告警轉為可驗證指標 + +**背景**:Telegram 反覆出現「AI 治理警報|SLO 資料缺口」,但訊息只能說 `all_slo_metrics_not_emitted`,無法讓 Operator 判斷是 Pod 掛載、Prometheus rule、還是 emitter 本身缺失。Production 查核確認 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir` 掛載,真正缺口是 `/metrics` 沒有輸出 ADR-100 recording rules 所需的底層 series。 + +**修正**: +- 新增 `adr100_slo_metrics_service.py`,從 PostgreSQL 事實來源產出 DB-derived Prometheus 指標:`automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`。 +- `/metrics` 追加 ADR-100 SLO emitter,不新增 DB schema、不改 Prometheus scrape target,讓既有 `awoooi-api` scrape job 可直接取得底層 series。 +- `GovernanceAgent` 的 SLO no-data hint 改成 emitter / recording rule / multiprocess mount 三段式,不再把已驗證存在的 `PROMETHEUS_MULTIPROC_DIR` 當成單一原因。 +- 清理 `main.py` 兩個既有未使用 import(`aiops_flags`、`_dt`),避免本次觸碰檔案繼續帶 F401 技術債。 + +**本地驗證**: +- `python3 -m py_compile apps/api/src/services/adr100_slo_metrics_service.py 
apps/api/src/services/governance_agent.py apps/api/src/main.py apps/api/tests/test_adr100_slo_metrics_service.py`:pass。 +- `pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`:47 passed。 +- `ruff check --select F,E9 src/services/adr100_slo_metrics_service.py src/services/governance_agent.py src/main.py tests/test_adr100_slo_metrics_service.py`:pass。 +- `git diff --check`:pass。 +- Production SQL dry-run:automation / verification / knowledge / high-confidence approval 查詢均可在現有 schema 上執行。 + +**目前整體進度**: +- Alertmanager 低風險自動修復主線:約 96%。 +- 完整 AI 自動化管理產品化:約 78%。 +- T18 正在推版;推版後需等 Prometheus scrape / recording rule evaluation,再確認 `sli:*` 不再全空,並觀察 `governance_slo_data_gap` 是否停止重複推播。 + ## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復,解除前端工作鏈路紅燈 **背景**:T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞,但 API 層本身仍有兩個紅燈:`GET /api/v1/ai/governance/events?...` 回 500,`GET /api/v1/ai/governance/queue?dispatch_status=pending` 回 `table_pending=true`。統帥要求前端要能呈現完整流程,不能讓治理告警與 dispatch 階段停在 API 黑盒。 diff --git a/docs/adr/ADR-100-ai-autonomous-slo.md b/docs/adr/ADR-100-ai-autonomous-slo.md index d4a5c94f..40b90443 100644 --- a/docs/adr/ADR-100-ai-autonomous-slo.md +++ b/docs/adr/ADR-100-ai-autonomous-slo.md @@ -158,6 +158,8 @@ increase(knowledge_entries_total[24h]) | `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 | | `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard | | `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 | +| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series | +| `apps/api/src/main.py` `/metrics` | 2026-05-14 T18:追加 DB-derived SLO emitter,讓既有 `awoooi-api` scrape job 取得底層 series | ## 決策理由 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 
af2154f0..97648155 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2125,6 +2125,13 @@ Phase 6 完成後 - Production deploy:`08d28dc4` 與 enum cast hotfix `6220f522` 已推 Gitea main;Code Review runs `2151` / `2153` success;CD runs `2150` / `2152` success;最新 deploy marker `9b32d3a9 chore(cd): deploy 6220f52 [skip ci]`;API / Worker / Web image 均為 `6220f5226693330a378f363202bd79065ab7fc34`;`governance/events` 200、`governance/queue` 200 且 `table_pending=false`;`/zh-TW/awooop/work-items` 200。 - 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 76%。下一段收斂 governance dispatcher skipped reason / leader-dedupe / ADR-100 SLO emitter,並把治理 dispatch 階段完整呈現在 Operator Console。 +**T18 ADR-100 SLO emitter 接入(2026-05-14 台北)**: +- 觸發:治理告警 `governance_slo_data_gap` 反覆推 Telegram,但 production 查核顯示 API Pod 已有 `PROMETHEUS_MULTIPROC_DIR` 與 `emptyDir`,真正缺口是 `/metrics` 未輸出 ADR-100 recording rules 所需底層 series,導致 `sli:*` 全部 empty result。 +- 修正:新增 DB-derived `/metrics` emitter,從 `automation_operation_log`、`incident_evidence`、`knowledge_entries`、`approval_records` 暴露 `automation_operation_log_total`、`post_execution_verification_total`、`knowledge_entries_total`、`approval_records_high_confidence_total`、`approval_records_high_confidence_success_total`;不新增 schema、不改 scrape target。 +- 訊息治理:`GovernanceAgent` no-data hint 改為 emitter / recording rule / multiprocess mount 三段式,避免 Operator 被誤導成只有 `PROMETHEUS_MULTIPROC_DIR` 未設。 +- 驗證:`py_compile` pass;`pytest tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q` 47 passed;ruff F/E9 pass;diff check pass;production SQL dry-run 通過。 +- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 78%。推版後需等 Prometheus scrape / recording rule evaluation,再確認 `sli:*` 不再全空。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d)