diff --git a/apps/api/src/api/v1/platform/operator_runs.py b/apps/api/src/api/v1/platform/operator_runs.py index 43586bd6..fe0826d9 100644 --- a/apps/api/src/api/v1/platform/operator_runs.py +++ b/apps/api/src/api/v1/platform/operator_runs.py @@ -108,8 +108,9 @@ async def list_runs( ) async def list_approvals( project_id: str | None = Query(None, description="租戶 ID(可選)"), + run_id: str | None = Query(None, description="Run ID(可選,M8 詳情頁查單筆)"), ) -> dict[str, Any]: - return await list_approvals_svc(project_id=project_id) + return await list_approvals_svc(project_id=project_id, run_id=run_id) @router.post( diff --git a/apps/api/src/repositories/metrics_repository.py b/apps/api/src/repositories/metrics_repository.py index 03099329..3108b1a4 100644 --- a/apps/api/src/repositories/metrics_repository.py +++ b/apps/api/src/repositories/metrics_repository.py @@ -60,13 +60,17 @@ class MetricsDBRepository(IMetricsRepository): cutoff = datetime.now(UTC) - timedelta(hours=hours) # Query: 統計 executed vs total (approved + executed + execution_failed) + # 2026-05-06 ogt + Codex: + # approval_records.status 目前實際寫入的是大寫 enum + # (APPROVED / EXECUTION_SUCCESS / EXECUTION_FAILED)。舊查詢只看 + # lowercase executed,導致 AI Success 在報表層永遠趨近 0。 query = text(""" SELECT - COUNT(CASE WHEN status = 'executed' THEN 1 END) as executed_count, + COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) as executed_count, COUNT(*) as total_count FROM approval_records WHERE created_at >= :cutoff - AND status IN ('approved', 'executed', 'execution_failed') + AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED') """) result = await session.execute(query, {"cutoff": cutoff}) @@ -127,11 +131,11 @@ class MetricsDBRepository(IMetricsRepository): trend_query = text(""" SELECT date_trunc('hour', created_at) as hour_bucket, - COUNT(CASE WHEN status = 'executed' THEN 1 END) * 100.0 / + COUNT(CASE WHEN UPPER(status::text) = 'EXECUTION_SUCCESS' THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0) as hourly_rate FROM approval_records WHERE created_at >= :cutoff - AND status IN ('approved', 'executed', 'execution_failed') + AND UPPER(status::text) IN ('APPROVED', 'EXECUTION_SUCCESS', 'EXECUTION_FAILED') GROUP BY hour_bucket ORDER BY hour_bucket DESC LIMIT :limit diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index fb44ecbd..11a08f24 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -457,6 +457,8 @@ class AutoRepairService: except Exception as _db_e: logger.error("auto_repair_db_write_failed", error=str(_db_e)) + self._record_auto_repair_metric(playbook, success=True) + # 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型 # P0-1 Fix: 統一使用 AnomalyCounter.hash_signature() try: @@ -630,6 +632,8 @@ class AutoRepairService: except Exception as _db_e: logger.error("auto_repair_db_write_failed", error=str(_db_e)) + self._record_auto_repair_metric(playbook, success=False) + # 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN # 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化) try: @@ -700,6 +704,35 @@ class AutoRepairService: return max_risk + def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None: + """把實際 auto-repair 執行寫入 Prometheus 指標。 + + 2026-05-06 ogt + Codex:DB 已有 auto_repair_executions,但 + core.metrics.record_auto_repair() 長期零 caller,導致治理/心跳用 + Prometheus 看起來像「飛輪沒做事」。label 使用 action_type,避免 + playbook_id 造成高基數。 + """ + try: + from src.core.metrics import record_auto_repair + + first_step = playbook.repair_steps[0] if playbook.repair_steps else None + action = first_step.action_type.value if first_step else "unknown" + max_risk = self._get_max_risk_level(playbook) + tier = { + RiskLevel.LOW: 1, + RiskLevel.MEDIUM: 2, + RiskLevel.HIGH: 3, + RiskLevel.CRITICAL: 4, + }.get(max_risk, 0) + record_auto_repair(action=action, tier=tier, success=success) + except Exception as e: + logger.warning( + "auto_repair_metric_record_failed", + playbook_id=playbook.playbook_id, + success=success, + error=str(e), + ) + def _is_host_or_backup_incident(self, incident: Incident) -> bool: """主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。""" diff --git a/apps/api/src/services/flywheel_stats_service.py b/apps/api/src/services/flywheel_stats_service.py index 1ea75078..1d51d471 100644 --- a/apps/api/src/services/flywheel_stats_service.py +++ b/apps/api/src/services/flywheel_stats_service.py @@ -237,6 +237,31 @@ class FlywheelStatsService: except (json.JSONDecodeError, KeyError): continue + # 2026-05-06 ogt + Codex: + # 執行成功率的 source of truth 是 auto_repair_executions。 + # Redis playbook success_count/failure_count 會因回寫鏈路中斷而落後, + # 造成 governance / heartbeat 判定「飛輪沒有執行」。 + try: + async with get_db_context() as db: + row = await db.execute( + text(""" + SELECT + COUNT(*) FILTER (WHERE success IS TRUE) AS success, + COUNT(*) AS total + FROM auto_repair_executions + WHERE created_at >= NOW() - interval '24 hours' + """) + ) + repair_stats = row.one() + db_total_exec = int(repair_stats.total or 0) + if db_total_exec >= FLYWHEEL_MIN_SAMPLE: + db_total_success = int(repair_stats.success or 0) + return count, db_total_success / db_total_exec + if db_total_exec > 0: + return count, None + except Exception: + logger.warning("flywheel_stats_auto_repair_execution_query_failed") + if total_exec < FLYWHEEL_MIN_SAMPLE: # 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷 return count, None diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py index 4cc58ad8..106e6f1e 100644 --- a/apps/api/src/services/heartbeat_report_service.py +++ b/apps/api/src/services/heartbeat_report_service.py @@ -15,7 +15,7 @@ HeartbeatReportService — ADR-073 心跳監控重構 import asyncio import html from dataclasses import dataclass, field -from datetime import datetime, timedelta +from datetime import datetime from typing import Optional import httpx @@ -420,8 +420,8 @@ class HeartbeatReportService: try: # KM 向量化率(DB 查詢) from src.db.base import get_db_context - from src.db.models import IncidentRecord, KnowledgeEntryRecord - from sqlalchemy import func, select + from src.db.models import KnowledgeEntryRecord + from sqlalchemy import func, select, text as sa_text async with get_db_context() as db: # KM 總數 km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord)) @@ -436,20 +436,22 @@ class HeartbeatReportService: stats.km_vectorized = vec_result.scalar() or 0 # 24h 修復統計 - since = datetime.utcnow() - timedelta(hours=24) - outcomes = await db.execute( - select(IncidentRecord.outcome).where( - IncidentRecord.created_at >= since, - IncidentRecord.outcome.isnot(None), - ) - ) - outcome_list = [r[0] for r in outcomes.all() if r[0]] - stats.attempt_24h = len(outcome_list) - stats.success_24h = sum( - 1 for o in outcome_list - if isinstance(o, dict) and o.get("execution_success") - or isinstance(o, str) and "success" in o.lower() + # 2026-05-06 ogt + Codex: + # incidents.outcome 已不是自動修復 source of truth。實際執行紀錄 + # 寫在 auto_repair_executions;舊查詢會讓心跳報告顯示 0/15, + # 造成「全系統正常」但飛輪 KPI 失真的假象。 + repair_result = await db.execute( + sa_text(""" + SELECT + COUNT(*) FILTER (WHERE success IS TRUE) AS success, + COUNT(*) AS total + FROM auto_repair_executions + WHERE created_at >= NOW() - interval '24 hours' + """) ) + repair_row = repair_result.one() + stats.success_24h = int(repair_row.success or 0) + stats.attempt_24h = int(repair_row.total or 0) # 最後學習活動 last_km = await db.scalar( @@ -865,9 +867,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: lines.append("☸️ Kubernetes Pods") for i, pod in enumerate(report.pods): prefix = "└─" if i == len(report.pods) - 1 else "├─" - ready_icon = "✅" if pod.ready else "❌" + ready_icon = "✅" if pod.ready or pod.status in ("Succeeded", "Completed") else "❌" restart_str = f" (重啟×{pod.restarts})" if pod.restarts > 0 else "" - lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}") + status_str = "" if pod.ready else f" {html.escape(pod.status)}" + lines.append(f"{prefix} {ready_icon} {html.escape(pod.name[:35])}{restart_str}{status_str}") # --- Scanner 狀態 --- if report.scanners.last_runs: diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index f91c9031..b72a2d2f 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -9,8 +9,6 @@ ADR-106(AwoooP Agent Platform) from __future__ import annotations import uuid -from datetime import datetime -from decimal import Decimal from typing import Any from uuid import UUID @@ -153,8 +151,21 @@ async def list_runs( # Approvals # ============================================================================= -async def list_approvals(project_id: str | None) -> dict[str, Any]: - """列出所有 waiting_approval 狀態的 runs。""" +async def list_approvals( + project_id: str | None, + run_id: str | None = None, +) -> dict[str, Any]: + """列出 waiting_approval runs,可依 project_id 或 run_id 篩選。""" + run_uuid: UUID | None = None + if run_id: + try: + run_uuid = uuid.UUID(run_id) + except ValueError as exc: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"run_id 格式錯誤: {exc}", + ) from exc + async with get_db_context("awoooi") as db: stmt = ( select(AwoooPRunState) @@ -163,6 +174,8 @@ async def list_approvals(project_id: str | None) -> dict[str, Any]: ) if project_id is not None: stmt = stmt.where(AwoooPRunState.project_id == project_id) + if run_uuid is not None: + stmt = stmt.where(AwoooPRunState.run_id == run_uuid) count_stmt = select(func.count()).select_from(stmt.subquery()) total_result = await db.execute(count_stmt) diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index de008cc5..efc8fa39 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -67,7 +67,8 @@ "operations": "Operations", "securityCompliance": "Security & Compliance", "classicAICenter": "Classic AI Center", - "governance": "AI Governance" + "governance": "AI Governance", + "awooop": "AwoooP" }, "locale": { "switch": "Switch Language", @@ -1480,4 +1481,4 @@ "retry": "Retry" } } -} \ No newline at end of file +} diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index c3011b41..4e27c776 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -67,7 +67,8 @@ "operations": "營運", "securityCompliance": "安全合規", "classicAICenter": "經典 AI 中心", - "governance": "AI 治理" + "governance": "AI 治理", + "awooop": "AwoooP" }, "locale": { "switch": "切換語系", @@ -1481,4 +1482,4 @@ "retry": "重試" } } -} \ No newline at end of file +} diff --git a/apps/web/src/app/[locale]/awooop/approvals/page.tsx b/apps/web/src/app/[locale]/awooop/approvals/page.tsx index 2755f72b..2508f7b8 100644 --- a/apps/web/src/app/[locale]/awooop/approvals/page.tsx +++ b/apps/web/src/app/[locale]/awooop/approvals/page.tsx @@ -114,7 +114,7 @@ function ApprovalRow({ approval }: { approval: Approval }) { @@ -232,7 +232,7 @@ export default function ApprovalsPage() { {/* Error State */} {error && ( -
+