diff --git a/apps/api/src/api/v1/ai_slo.py b/apps/api/src/api/v1/ai_slo.py
new file mode 100644
index 00000000..8ce200c8
--- /dev/null
+++ b/apps/api/src/api/v1/ai_slo.py
@@ -0,0 +1,58 @@
+"""
+AI SLO REST API
+===============
+ADR-087 Phase 6 self-governance loop — query endpoint for AI decision-quality SLOs
+
+Endpoints:
+  GET /api/v1/ai/slo — fetch the latest SLO calculation result (Redis-cached)
+
+Design principles:
+- Read the service-layer cache first (TTL 5 min); recompute only when it has expired
+- Calculation failure → conservatively return any_violated=True (handled by AiSloCalculator)
+- Force a recompute with ?force_refresh=true
+- The router layer never accesses Redis directly (leWOOOgo building-block rule)
+
+2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 6 initial creation
+"""
+
+from __future__ import annotations
+
+import structlog
+from fastapi import APIRouter, Query
+
+from src.services.ai_slo_calculator import AiSloCalculator
+
+logger = structlog.get_logger(__name__)
+
+router = APIRouter()
+
+
+@router.get("/ai/slo")
+async def get_ai_slo(
+    force_refresh: bool = Query(False, description="忽略快取,強制重算"),
+) -> dict:
+    """
+    Return the latest AI decision-quality SLO report.
+
+    A cached report is served when one is available; force_refresh=true
+    skips the cache lookup and recomputes via AiSloCalculator.run(), which
+    per the original notes also refreshes the cache (TTL 5 min).
+
+    Response:
+        calculated_at   ISO timestamp
+        window_days     calculation window (days)
+        any_violated    whether any SLO is violated
+        cache_hit       whether the cache was hit
+        metrics[]       details of the three SLO metrics
+    """
+    calculator = AiSloCalculator()
+
+    # Skip the cache lookup entirely when a recompute was requested.
+    report = None if force_refresh else await calculator.get_cached_report()
+    cache_hit = bool(report)
+    if not cache_hit:
+        report = await calculator.run()
+
+    payload = report.to_dict()
+    payload["cache_hit"] = cache_hit
+    return payload
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index 47aed008..be36be1c 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -34,6 +34,7 @@ from sentry_sdk.integrations.starlette import StarletteIntegration
 
 from src.api.v1 import agents as agents_v1 # Phase 9.5: Agent Teams API
 from src.api.v1 import ai as ai_v1
+from src.api.v1 import ai_slo as ai_slo_v1 # Phase 6 ADR-087: AI SLO self-governance
 from src.api.v1 import approvals as approvals_v1
 from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
 from src.api.v1 import audit_logs as audit_logs_v1
@@ 
-514,6 +515,7 @@ app.include_router(csrf_v1.router, prefix="/api/v1", tags=["Security"]) # Phase
 app.include_router(dashboard_v1.router, prefix="/api/v1", tags=["Dashboard"])
 app.include_router(approvals_v1.router, prefix="/api/v1", tags=["HITL Approvals"])
 app.include_router(ai_v1.router, prefix="/api/v1", tags=["AI Decision"])
+app.include_router(ai_slo_v1.router, prefix="/api/v1", tags=["AI SLO"]) # Phase 6 ADR-087
 app.include_router(webhooks_v1.router, prefix="/api/v1", tags=["Webhooks"])
 app.include_router(timeline_v1.router, prefix="/api/v1", tags=["Timeline"])
 app.include_router(audit_logs_v1.router, prefix="/api/v1", tags=["Audit Logs"])
diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
index 5bc33eda..5020ea6c 100644
--- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
+++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
@@ -1484,3 +1484,48 @@ Phase 6 完成後
 - `AIOPS_P3_EVOLVER_ENABLED`
 
 **下一步:** ADR-083 草稿 → Gate 3 架構審查 → Phase 3 commit push Gitea
+
+---
+
+### 2026-04-15 深夜 (台北) — P0 告警靜默根治 + Phase 6 自我治理閉環收官
+
+**P0 告警靜默 RCA(3 根因):**
+
+| # | 根因 | 影響 | 修復 |
+|---|------|------|------|
+| 1 | `approval_db.py:find_by_fingerprint()` PENDING 無 TTL | 舊 PENDING 記錄(hit_count=77/30/17)永久吸收相同 fingerprint 告警,Telegram 完全靜默 | 加 `PENDING_TTL_HOURS=24` 時限;kubectl 直接過期 7 筆殭屍記錄 |
+| 2 | `create_approval_with_fingerprint()` expires_at=NULL | 新建 ApprovalRecord 永遠不過期,自動過期邏輯形同虛設 | 加 `DEFAULT_APPROVAL_TTL_HOURS=48` 預設值 |
+| 3 | `openclaw.py:897` DIAGNOSE require_local=True | v4.3 早已決定 NIM 為主力但未更新,所有 DIAGNOSE 請求 privacy_skip → LLM 無聲失敗 | 將 DIAGNOSE 移出 require_local 條件 |
+
+**P2 飛輪斷鏈修復:**
+- 新建 `jobs/approval_timeout_resolver.py`:每小時掃描逾期 PENDING,標記 EXPIRED + 呼叫 `resolve_incident(resolution_type="timeout")`
+- `anomaly_counter.py` 新增 `timeout_ignored` disposition(隱式負向回饋)
+- `incident_service.py.resolve_incident()` 新增 
`resolution_type` 參數
+
+**asyncpg CrashLoopBackOff 修復:**
+- `db/base.py` Phase 6 migration 三條 CREATE INDEX 拆為獨立 execute(asyncpg 不允許 prepared statement 多指令)
+
+**Phase 6 自我治理閉環 — 全部完成:**
+
+| 元件 | 檔案 | 說明 |
+|------|------|------|
+| AI SLO 計算器 | `services/ai_slo_calculator.py` | 三大 SLO(成功率/推翻率/false neg rate)7d 滾動,Redis 快取 5min |
+| Trust Drift 偵測器 | `services/trust_drift_detector.py` | 偵測 optimism_bias / confidence_collapse,寫 ai_governance_events |
+| KB Rot 清理 Job | `jobs/kb_rot_cleaner.py` | ROT-1 Kubernetes API 版本/ROT-2 Prometheus 指標/ROT-3 90d 陳舊 case |
+| 自我降級引擎 | `services/decision_manager.py` | SLO 違反時自動升高 confidence_threshold,啟動降級保護 |
+| SLO REST API | `api/v1/ai_slo.py` | GET /api/v1/ai/slo(含 force_refresh 參數) |
+| DB 表 + Migration | `db/models.py` + `db/base.py` | AiGovernanceEvent 不可變 Event Sourcing 表 + 3 個 index |
+
+**附帶修復:**
+- `main.py` 停用 Telegram 心跳監控(已轉發到另一群組)
+- `ai_router.py` 失敗通知改為 ADR-075 TYPE-1 格式(禁止 raw text)
+
+**Phase 6 退出條件(§7.1):**
+- [x] SLO 計算器可回傳三大指標
+- [x] Trust drift 偵測器可寫 ai_governance_events
+- [x] KB rot 清理 job 可運行
+- [x] decision_manager 自我降級邏輯掛載
+- [x] REST API `/api/v1/ai/slo` 可查詢
+- [ ] 生產驗證(等 3ce5025 / Phase 6 image 部署後觀察)
+
+**commit chain:** fab65e7 → f31b4e3 → f045506 → f9ba200 → 3ce5025 → (Phase 6 REST API commit)