fix(api): add quality summary slo metric
All checks were successful
CD Pipeline / tests (push) Successful in 1m19s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 3m31s
CD Pipeline / post-deploy-checks (push) Successful in 1m29s

This commit is contained in:
Your Name
2026-06-01 17:00:50 +08:00
parent 9954e97710
commit d6c904dd0f
9 changed files with 377 additions and 27 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from time import perf_counter
from typing import Any
from fastapi import APIRouter, Depends, Query
@@ -13,6 +14,7 @@ from src.core.awooop_operator_auth import (
from src.services.awooop_truth_chain_service import (
fetch_automation_quality_summary,
fetch_truth_chain,
record_quality_summary_observation,
)
router = APIRouter()
@@ -33,12 +35,25 @@ async def get_automation_quality_summary(
limit: int = Query(200, ge=1, le=500, description="最多評估 incident 數"),
refresh: bool = Query(False, description="略過短 TTL 快取並重新聚合"),
) -> dict[str, Any]:
summary = await fetch_automation_quality_summary(
project_id=project_id,
hours=hours,
limit=limit,
refresh=refresh,
)
started_at = perf_counter()
try:
summary = await fetch_automation_quality_summary(
project_id=project_id,
hours=hours,
limit=limit,
refresh=refresh,
)
except Exception as exc:
record_quality_summary_observation(
project_id=project_id,
hours=hours,
limit=limit,
cache_status="error",
success=False,
duration_seconds=perf_counter() - started_at,
error=exc.__class__.__name__,
)
raise
summary["examples"] = []
summary["visibility_note"] = (
"Aggregate only. Use /truth-chain/{source_id} with operator auth for source-level details."

View File

@@ -15,6 +15,7 @@ from time import time
from sqlalchemy import text
from src.db.base import get_db_context
from src.services.awooop_truth_chain_service import get_quality_summary_observations
@dataclass(frozen=True)
@@ -30,6 +31,18 @@ class VerificationSample:
count: int
@dataclass(frozen=True)
class QualitySummaryObservation:
    """Immutable record of one truth-chain quality summary observation.

    Instances are built from the dict rows returned by
    ``get_quality_summary_observations()`` and rendered as gauge samples by
    ``render_adr100_slo_metrics``.
    """

    # Rendered as the ``project_id`` label (escaped before output).
    project_id: str
    # Rendered verbatim as the ``hours`` label.
    hours: int
    # Rendered verbatim as the ``limit`` label.
    limit: int
    # Rendered as the ``cache_status`` label; the snapshot builder falls back
    # to "unknown" when the row has no value.
    cache_status: str
    # Rendered both as the ``success`` label ("true"/"false") and as the
    # 1/0 value of the ``..._last_success`` gauge.
    success: bool
    # Value of the ``..._last_duration_seconds`` gauge (six decimal places).
    duration_seconds: float
    # Value of the ``..._observed_timestamp`` gauge; presumably Unix epoch
    # seconds as produced by ``time.time()`` in the recorder -- TODO confirm.
    observed_at: float
    # Optional error text carried on the row; not emitted as a metric label.
    error: str | None = None
@dataclass(frozen=True)
class Adr100SloMetricsSnapshot:
automation_operations: list[AutomationOperationSample] = field(default_factory=list)
@@ -40,6 +53,7 @@ class Adr100SloMetricsSnapshot:
knowledge_entries_created_24h: int = 0
high_confidence_total: int = 0
high_confidence_success_total: int = 0
quality_summary_observations: list[QualitySummaryObservation] = field(default_factory=list)
emitted_at: float = field(default_factory=time)
@@ -123,6 +137,23 @@ class Adr100SloMetricsService:
high_confidence_success_total=int(
confidence_row.high_confidence_success_total or 0
),
quality_summary_observations=[
QualitySummaryObservation(
project_id=str(row.get("project_id") or "awoooi"),
hours=int(row.get("hours") or 0),
limit=int(row.get("limit") or 0),
cache_status=str(row.get("cache_status") or "unknown"),
success=bool(row.get("success")),
duration_seconds=float(row.get("duration_seconds") or 0.0),
observed_at=float(row.get("observed_at") or 0.0),
error=(
str(row.get("error"))
if row.get("error") is not None
else None
),
)
for row in get_quality_summary_observations()
],
)
@@ -208,8 +239,56 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
"# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
"# TYPE adr100_slo_emitter_last_success_timestamp gauge",
f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
"",
])
lines.extend([
"# HELP awooop_truth_chain_quality_summary_last_duration_seconds Last observed AwoooP truth-chain quality summary aggregation duration",
"# TYPE awooop_truth_chain_quality_summary_last_duration_seconds gauge",
])
if snapshot.quality_summary_observations:
for observation in snapshot.quality_summary_observations:
labels = _quality_summary_labels(observation)
lines.append(
"awooop_truth_chain_quality_summary_last_duration_seconds"
f"{labels} {observation.duration_seconds:.6f}"
)
else:
lines.append(
'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
)
lines.extend([
"# HELP awooop_truth_chain_quality_summary_last_success Last observed AwoooP truth-chain quality summary success flag",
"# TYPE awooop_truth_chain_quality_summary_last_success gauge",
])
if snapshot.quality_summary_observations:
for observation in snapshot.quality_summary_observations:
labels = _quality_summary_labels(observation)
lines.append(
"awooop_truth_chain_quality_summary_last_success"
f"{labels} {1 if observation.success else 0}"
)
else:
lines.append(
'awooop_truth_chain_quality_summary_last_success{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
)
lines.extend([
"# HELP awooop_truth_chain_quality_summary_observed_timestamp Last observed AwoooP truth-chain quality summary timestamp",
"# TYPE awooop_truth_chain_quality_summary_observed_timestamp gauge",
])
if snapshot.quality_summary_observations:
for observation in snapshot.quality_summary_observations:
labels = _quality_summary_labels(observation)
lines.append(
"awooop_truth_chain_quality_summary_observed_timestamp"
f"{labels} {observation.observed_at:.0f}"
)
else:
lines.append(
'awooop_truth_chain_quality_summary_observed_timestamp{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
)
lines.append("")
return "\n".join(lines)
@@ -217,6 +296,18 @@ def _escape_label(value: str) -> str:
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
def _quality_summary_labels(observation: QualitySummaryObservation) -> str:
    """Build the ``{...}`` label set used by every quality summary gauge.

    The string-valued labels (``project_id`` and ``cache_status``) go through
    ``_escape_label``; the integer labels are formatted as-is and ``success``
    is spelled ``"true"`` or ``"false"``.

    Args:
        observation: The observation whose fields become label values.

    Returns:
        The label block including the surrounding braces, with labels in the
        fixed order project_id, hours, limit, cache_status, success.
    """
    success_text = "true" if observation.success else "false"
    label_pairs = (
        ("project_id", _escape_label(observation.project_id)),
        ("hours", observation.hours),
        ("limit", observation.limit),
        ("cache_status", _escape_label(observation.cache_status)),
        ("success", success_text),
    )
    rendered = ",".join(f'{label}="{value}"' for label, value in label_pairs)
    return "{" + rendered + "}"
_AUTOMATION_OPERATION_SQL = """
WITH automation_scope AS (
SELECT

View File

@@ -80,6 +80,16 @@ ADR100_SLO_DEFINITIONS: tuple[Adr100SloDefinition, ...] = (
unit="count",
window="24h",
),
Adr100SloDefinition(
name="truth_chain_quality_summary_latency",
query='max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
target=2.0,
hard_red_line=8.0,
direction="below",
unit="seconds",
window="last_observation",
minimum_events=0.0,
),
)

View File

@@ -13,6 +13,7 @@ import shutil
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from time import perf_counter, time
from typing import Any
from uuid import UUID
@@ -37,6 +38,54 @@ _JSON_TEXT_FIELDS = {"gate_result", "source_envelope"}
_QUALITY_SUMMARY_CACHE_TTL_SECONDS = int(
os.getenv("AWOOOP_QUALITY_SUMMARY_CACHE_TTL_SECONDS", "30")
)
_QUALITY_SUMMARY_OBSERVATIONS: dict[str, dict[str, Any]] = {}
# Every distinct (project_id, hours, limit, cache_status, success) tuple becomes
# one dict entry and, downstream, one Prometheus series per gauge. Cap the
# number of retained entries so caller-supplied values cannot grow process
# memory or metric cardinality without bound.
_QUALITY_SUMMARY_OBSERVATIONS_MAX_ENTRIES = 512


def record_quality_summary_observation(
    *,
    project_id: str,
    hours: int,
    limit: int,
    cache_status: str,
    success: bool,
    duration_seconds: float,
    error: str | None = None,
) -> None:
    """Store the latest quality summary observation for one label combination.

    Observations are kept in the module-level ``_QUALITY_SUMMARY_OBSERVATIONS``
    dict, keyed by project, hours, limit, cache status and outcome. A new
    observation replaces the previous one with the same key. When more than
    ``_QUALITY_SUMMARY_OBSERVATIONS_MAX_ENTRIES`` keys are held, the least
    recently observed entries are evicted.

    Args:
        project_id: Project identifier; empty/None falls back to ``"awoooi"``.
        hours: Aggregation window in hours (coerced with ``int``).
        limit: Incident limit used for the aggregation (coerced with ``int``).
        cache_status: Cache outcome label; empty/None falls back to
            ``"unknown"``.
        success: Whether the summary was produced successfully.
        duration_seconds: Measured duration; negative values are clamped to 0.
        error: Optional error text, truncated to 160 characters.
    """
    normalized_project_id = project_id or "awoooi"
    normalized_cache_status = cache_status or "unknown"
    normalized_hours = int(hours)
    normalized_limit = int(limit)
    key = "|".join([
        normalized_project_id,
        str(normalized_hours),
        str(normalized_limit),
        normalized_cache_status,
        "success" if success else "failed",
    ])
    # Remove any previous entry first so that dict insertion order tracks
    # recency; assigning to an existing key would keep its old position.
    _QUALITY_SUMMARY_OBSERVATIONS.pop(key, None)
    _QUALITY_SUMMARY_OBSERVATIONS[key] = {
        "project_id": normalized_project_id,
        "hours": normalized_hours,
        "limit": normalized_limit,
        "cache_status": normalized_cache_status,
        "success": bool(success),
        "duration_seconds": max(0.0, float(duration_seconds)),
        "observed_at": time(),
        "error": str(error)[:160] if error else None,
    }
    # Evict from the front (least recently observed) until within the cap.
    while len(_QUALITY_SUMMARY_OBSERVATIONS) > _QUALITY_SUMMARY_OBSERVATIONS_MAX_ENTRIES:
        oldest_key = next(iter(_QUALITY_SUMMARY_OBSERVATIONS))
        del _QUALITY_SUMMARY_OBSERVATIONS[oldest_key]
def get_quality_summary_observations() -> list[dict[str, Any]]:
    """Return copies of every recorded observation in a stable order.

    Rows are ordered by project id, hours, limit, cache status and finally the
    success flag. Each returned dict is a shallow copy, so callers may mutate
    the result without touching the module-level store.
    """

    def _ordering(entry: dict[str, Any]) -> tuple[str, int, int, str, bool]:
        # Missing or falsy fields sort as empty string / zero / False.
        return (
            str(entry.get("project_id") or ""),
            int(entry.get("hours") or 0),
            int(entry.get("limit") or 0),
            str(entry.get("cache_status") or ""),
            bool(entry.get("success")),
        )

    snapshot = list(_QUALITY_SUMMARY_OBSERVATIONS.values())
    snapshot.sort(key=_ordering)
    return [dict(entry) for entry in snapshot]
def _clean(value: Any) -> Any:
@@ -2079,6 +2128,7 @@ async def fetch_automation_quality_summary(
refresh: bool = False,
) -> dict[str, Any]:
"""Return a recent incident-level quality summary for the automation flywheel."""
started_at = perf_counter()
bounded_hours = max(1, min(int(hours), 168))
bounded_limit = max(1, min(int(limit), 500))
normalized_project_id = project_id or "awoooi"
@@ -2094,13 +2144,26 @@ async def fetch_automation_quality_summary(
ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
)
if cached_summary is not None:
duration_seconds = perf_counter() - started_at
record_quality_summary_observation(
project_id=normalized_project_id,
hours=bounded_hours,
limit=bounded_limit,
cache_status="hit",
success=True,
duration_seconds=duration_seconds,
)
logger.info(
"awooop_automation_quality_summary_cache_hit",
project_id=normalized_project_id,
window_hours=bounded_hours,
limit=bounded_limit,
ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
duration_seconds=round(duration_seconds, 3),
)
cached_summary = dict(cached_summary)
cached_summary["cache_status"] = "hit"
cached_summary["aggregation_duration_seconds"] = round(duration_seconds, 3)
return cached_summary
cutoff = datetime.now(UTC) - timedelta(hours=bounded_hours)
@@ -2525,9 +2588,22 @@ async def fetch_automation_quality_summary(
cache_status="miss",
cache_ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
)
return await store_operator_summary_async(
stored_summary = await store_operator_summary_async(
"truth_chain_quality_summary",
cache_key,
summary,
ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
)
duration_seconds = perf_counter() - started_at
record_quality_summary_observation(
project_id=normalized_project_id,
hours=bounded_hours,
limit=bounded_limit,
cache_status="miss",
success=True,
duration_seconds=duration_seconds,
)
stored_summary = dict(stored_summary)
stored_summary["cache_status"] = "miss"
stored_summary["aggregation_duration_seconds"] = round(duration_seconds, 3)
return stored_summary

View File

@@ -5,7 +5,7 @@
2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
3. llm_hallucination — 近 100 筆 evidence verification_result=failed 比例 > 10%
4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
5. slo_compliance — 4 個 SLO 合規性檢查ADR-100違反時降級飛輪行為
5. slo_compliance — 5 個 SLO 合規性檢查ADR-100違反時降級飛輪行為
所有 check 互相隔離try/except任一失敗不阻斷其他項目。
@@ -57,6 +57,36 @@ RECENT_LIMIT = 100 # 最近幾筆做統計
GOVERNANCE_SELF_CHECK_LEASE_KEY = "governance:self_check:cycle_lease"
def _slo_remediation_items(name: str) -> list[str]:
    """Return the remediation steps to attach to an alert for SLO *name*.

    The truth-chain quality summary latency SLO gets its own steps; every
    other SLO name receives the generic flywheel remediation steps. A new
    list is built on each call.
    """
    latency_steps = [
        "Check truth-chain quality summary cache miss latency and DB query plan",
        "Confirm operator summary cache is warm before treating homepage SLO as degraded",
    ]
    generic_steps = [
        "Pause auto-scaling or risky auto-fix tasks",
        "Review evidence/decision traces and adjust policy thresholds",
    ]
    return latency_steps if name == "truth_chain_quality_summary_latency" else generic_steps
def _slo_actionable_items(name: str) -> list[str]:
    """Return the operator follow-up actions to attach to an alert for SLO *name*.

    The truth-chain quality summary latency SLO points at its endpoint and
    metric; every other SLO name receives the generic follow-up actions. A new
    list is built on each call.
    """
    latency_actions = [
        "Call /api/v1/platform/truth-chain/quality/summary?limit=8&refresh=true and compare duration",
        "Inspect /metrics for awooop_truth_chain_quality_summary_last_duration_seconds",
    ]
    generic_actions = [
        "Check verifier lag and post-exec learning health",
        "Run emergency incident audit on failed approvals",
    ]
    return latency_actions if name == "truth_chain_quality_summary_latency" else generic_actions
def _slo_next_action(name: str) -> str:
    """Return the ``next_action`` identifier for an alert on SLO *name*.

    Only the truth-chain quality summary latency SLO has a dedicated probe
    action; all other names map to the flywheel safeguard trigger.
    """
    return (
        "run_truth_chain_quality_summary_latency_probe"
        if name == "truth_chain_quality_summary_latency"
        else "trigger_flywheel_safeguard"
    )
# =============================================================================
# GovernanceAgent
# =============================================================================
@@ -421,7 +451,7 @@ class GovernanceAgent:
# =========================================================================
async def check_slo_compliance(self) -> dict[str, Any]:
"""SLO 4 項合規性檢查 — 違反時降級飛輪行為
"""SLO 5 項合規性檢查 — 違反時降級飛輪行為
從 Prometheus Recording rules 讀取 SLI 值,
與硬紅線閾值比對,違反時呼叫 _alert() 寫 PG + 推 Telegram。
@@ -430,6 +460,7 @@ class GovernanceAgent:
SLO 2 決策準確率: sli:decision_accuracy:5m 硬紅線 < 0.85
SLO 3 信心校準: sli:confidence_calibration:1h 硬紅線 < 0.70
SLO 4 KM 增長率: knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5
SLO 5 總覽延遲: awooop_truth_chain_quality_summary_last_duration_seconds 硬紅線 > 8s
2026-04-27 P3.4 by Claude — AI SLOADR-100
"""
@@ -446,13 +477,15 @@ class GovernanceAgent:
"decision_accuracy": "sli:decision_accuracy:5m",
"confidence_calibration": "sli:confidence_calibration:1h",
"km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
"truth_chain_quality_summary_latency": 'max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
}
# 硬紅線:低於此值必須告警(非軟性警告)
# 硬紅線:above 指標低於此值、below 指標高於此值時必須告警(非軟性警告)
hard_red_lines: dict[str, float] = {
"autonomy_rate": 0.70,
"decision_accuracy": 0.85,
"confidence_calibration": 0.70,
"km_growth_rate": 5.0,
"truth_chain_quality_summary_latency": 8.0,
}
# SLO 目標值(供日誌記錄)
slo_targets: dict[str, float] = {
@@ -460,6 +493,14 @@ class GovernanceAgent:
"decision_accuracy": 0.90,
"confidence_calibration": 0.80,
"km_growth_rate": 20.0,
"truth_chain_quality_summary_latency": 2.0,
}
slo_directions: dict[str, str] = {
"autonomy_rate": "above",
"decision_accuracy": "above",
"confidence_calibration": "above",
"km_growth_rate": "above",
"truth_chain_quality_summary_latency": "below",
}
results: dict[str, Any] = {}
@@ -511,7 +552,17 @@ class GovernanceAgent:
continue
threshold = hard_red_lines[name]
target = slo_targets[name]
violated = value < threshold
direction = slo_directions.get(name, "above")
violated = value > threshold if direction == "below" else value < threshold
gap = (
value - threshold
if violated and direction == "below"
else threshold - value
if violated
else target - value
if direction == "below"
else value - target
)
results[name] = {
"name": name,
@@ -519,7 +570,8 @@ class GovernanceAgent:
"value": round(value, 4),
"slo_target": target,
"hard_red_line": threshold,
"gap": round(threshold - value, 4) if violated else round(value - target, 4),
"direction": direction,
"gap": round(gap, 4),
"violated": violated,
}
@@ -533,20 +585,15 @@ class GovernanceAgent:
"value": round(value, 4),
"target": target,
"threshold": threshold,
"gap": round(threshold - value, 4),
"direction": direction,
"gap": round(gap, 4),
},
"remediation": {
"items": [
"Pause auto-scaling or risky auto-fix tasks",
"Review evidence/decision traces and adjust policy thresholds",
],
"next_action": "trigger_flywheel_safeguard",
"items": _slo_remediation_items(name),
"next_action": _slo_next_action(name),
},
"actionable": {
"items": [
"Check verifier lag and post-exec learning health",
"Run emergency incident audit on failed approvals",
],
"items": _slo_actionable_items(name),
},
},
)
@@ -716,7 +763,7 @@ class GovernanceAgent:
"actionable": {
"items": [
"先確認 /metrics 是否已輸出 ADR-100 底層指標",
"檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
"檢查 Prometheus rule 與 truth-chain quality summary runtime metric 是否可查詢",
],
},
},

View File

@@ -1,6 +1,7 @@
from src.services.adr100_slo_metrics_service import (
Adr100SloMetricsSnapshot,
AutomationOperationSample,
QualitySummaryObservation,
VerificationSample,
render_adr100_slo_metrics,
)
@@ -38,6 +39,17 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
knowledge_entries_created_24h=25,
high_confidence_total=9,
high_confidence_success_total=7,
quality_summary_observations=[
QualitySummaryObservation(
project_id="awoooi",
hours=24,
limit=8,
cache_status="miss",
success=True,
duration_seconds=1.234567,
observed_at=1_778_756_100,
),
],
emitted_at=1_778_756_000,
)
@@ -58,6 +70,18 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
assert "approval_records_high_confidence_total 9" in rendered
assert "approval_records_high_confidence_success_total 7" in rendered
assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered
assert (
'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",'
'hours="24",limit="8",cache_status="miss",success="true"} 1.234567'
) in rendered
assert (
'awooop_truth_chain_quality_summary_last_success{project_id="awoooi",'
'hours="24",limit="8",cache_status="miss",success="true"} 1'
) in rendered
assert (
'awooop_truth_chain_quality_summary_observed_timestamp{project_id="awoooi",'
'hours="24",limit="8",cache_status="miss",success="true"} 1778756100'
) in rendered
def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
@@ -71,6 +95,10 @@ def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
assert 'post_execution_verification_created_24h{outcome="none"} 0' in rendered
assert "knowledge_entries_total 0" in rendered
assert "knowledge_entries_created_24h 0" in rendered
assert (
'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="none",'
'hours="0",limit="0",cache_status="none",success="false"} 0'
) in rendered
def test_render_adr100_slo_metrics_escapes_labels() -> None:

View File

@@ -10,6 +10,12 @@ from src.services.adr100_slo_status_service import (
)
QUALITY_SUMMARY_LATENCY_QUERY = (
'max(awooop_truth_chain_quality_summary_last_duration_seconds{'
'project_id="awoooi",limit="8",success="true"})'
)
class _FakePrometheusResponse:
def __init__(self, payload: dict[str, Any]) -> None:
self._payload = payload
@@ -89,6 +95,7 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
"sum(rate(approval_records_high_confidence_total[1h]))": "0",
"max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "24",
QUALITY_SUMMARY_LATENCY_QUERY: "1.2",
}
monkeypatch.setattr(
@@ -109,6 +116,8 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
assert by_name["confidence_calibration"]["status"] == "skipped_low_volume"
assert by_name["km_growth_rate"]["status"] == "ok"
assert by_name["km_growth_rate"]["value"] == 24
assert by_name["truth_chain_quality_summary_latency"]["status"] == "ok"
assert by_name["truth_chain_quality_summary_latency"]["direction"] == "below"
assert report["overall_status"] == "partial"
assert report["overall_compliance"] == 1.0
@@ -121,6 +130,7 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
"sum(rate(approval_records_high_confidence_total[1h]))": "0",
"max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "3",
QUALITY_SUMMARY_LATENCY_QUERY: "1.2",
}
monkeypatch.setattr(
@@ -142,6 +152,36 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
assert report["overall_status"] == "violated"
@pytest.mark.asyncio
async def test_fetch_report_classifies_below_direction_slo(monkeypatch):
    """A 9.5s latency reading must mark the "below"-direction SLO as violated."""
    prometheus_values = {
        "sum(rate(automation_operation_log_total[5m]))": "0",
        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "24",
        QUALITY_SUMMARY_LATENCY_QUERY: "9.5",
    }

    def _fake_client_factory(*args, **kwargs):
        # Stand-in for httpx.AsyncClient that answers from prometheus_values.
        return _FakePrometheusClient(prometheus_values)

    monkeypatch.setattr("httpx.AsyncClient", _fake_client_factory)
    monkeypatch.setattr(
        Adr100SloStatusService,
        "_fetch_verification_coverage",
        _low_volume_coverage,
    )

    report = await Adr100SloStatusService().fetch_report()

    metrics_by_name = {}
    for metric in report["metrics"]:
        metrics_by_name[metric["name"]] = metric
    latency_metric = metrics_by_name["truth_chain_quality_summary_latency"]

    assert latency_metric["status"] == "violated"
    assert latency_metric["direction"] == "below"
    assert latency_metric["value"] == 9.5
    assert report["overall_status"] == "violated"
def test_verification_coverage_payload_flags_backlog():
payload = _build_verification_coverage_payload(
{

View File

@@ -797,6 +797,12 @@ class TestRunSelfCheckGlobalFailureAlert:
assert "governance_self_failure" not in calls
QUALITY_SUMMARY_LATENCY_QUERY = (
'max(awooop_truth_chain_quality_summary_last_duration_seconds{'
'project_id="awoooi",limit="8",success="true"})'
)
class _FakePrometheusResponse:
def __init__(self, value: str) -> None:
self._value = value
@@ -809,7 +815,7 @@ class _FakePrometheusResponse:
class _FakePrometheusClient:
def __init__(self, value: str) -> None:
def __init__(self, value: str | dict[str, str]) -> None:
self._value = value
self.queries: list[str] = []
@@ -820,7 +826,10 @@ class _FakePrometheusClient:
return False
async def get(self, *args, **kwargs): # noqa: ANN002, ANN003
self.queries.append(str(kwargs.get("params", {}).get("query", "")))
query = str(kwargs.get("params", {}).get("query", ""))
self.queries.append(query)
if isinstance(self._value, dict):
return _FakePrometheusResponse(self._value.get(query, "NaN"))
return _FakePrometheusResponse(self._value)
@@ -838,6 +847,7 @@ class TestCheckSloCompliance:
"decision_accuracy",
"confidence_calibration",
"km_growth_rate",
"truth_chain_quality_summary_latency",
):
assert result[name]["status"] == "skipped"
assert result[name]["reason"] == "prometheus_nan_or_inf"
@@ -847,7 +857,13 @@ class TestCheckSloCompliance:
async def test_km_growth_prefers_db_derived_24h_gauge(self):
"""KM SLO 要優先使用 DB 24h gauge避免新 counter 暖機時誤報 0."""
agent = _make_agent()
client = _FakePrometheusClient("25")
client = _FakePrometheusClient({
"sli:autonomy_rate:5m": "0.95",
"sli:decision_accuracy:5m": "0.96",
"sli:confidence_calibration:1h": "0.97",
"max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "25",
QUALITY_SUMMARY_LATENCY_QUERY: "1.1",
})
with patch("httpx.AsyncClient", return_value=client):
result = await agent.check_slo_compliance()
@@ -855,3 +871,4 @@ class TestCheckSloCompliance:
assert "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)" in client.queries
assert result["km_growth_rate"]["status"] == "ok"
assert result["km_growth_rate"]["value"] == 25
assert result["truth_chain_quality_summary_latency"]["status"] == "ok"

View File

@@ -1,3 +1,29 @@
## 2026-06-01:truth-chain quality summary 納入 AI 自健診 SLO
**背景**
- 正式環境已把 `/api/v1/platform/truth-chain/quality/summary` 的 N+1 查詢修成批次化,但「首頁/quality summary 是否又變慢」尚未進入 AI 自健診。
- 先前飛輪核心異常只會看到泛化的 `auto_execute_success_rate`,無法快速判斷是治理資料、執行資料,還是 operator summary 資料面拖慢。
**本次調整**
- `apps/api/src/services/awooop_truth_chain_service.py`:記錄 quality summary 的 cache hit / miss 聚合耗時與最後觀測時間。
- `apps/api/src/api/v1/platform/truth_chain.py`:端點例外時也寫入 failure observation,讓 `/metrics` 能看見摘要面失敗。
- `apps/api/src/services/adr100_slo_metrics_service.py`:新增 `awooop_truth_chain_quality_summary_last_duration_seconds`、`awooop_truth_chain_quality_summary_last_success`、`awooop_truth_chain_quality_summary_observed_timestamp`。
- `apps/api/src/services/adr100_slo_status_service.py`:新增第 5 個 ADR-100 SLO `truth_chain_quality_summary_latency`,目標 `< 2s`、硬紅線 `> 8s`。
- `apps/api/src/services/governance_agent.py`:SLO 判斷支援 `above` / `below` 方向,避免把 latency 這種「越低越好」的指標誤判。
**驗證**
- `python3 -m py_compile apps/api/src/services/awooop_truth_chain_service.py apps/api/src/services/adr100_slo_metrics_service.py apps/api/src/services/adr100_slo_status_service.py apps/api/src/services/governance_agent.py apps/api/src/api/v1/platform/truth_chain.py`
- `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK`
- `DATABASE_URL=postgresql://test:test@localhost:5432/test PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_adr100_slo_metrics_service.py apps/api/tests/test_adr100_slo_status_service.py apps/api/tests/test_governance_agent.py apps/api/tests/test_awooop_truth_chain_service.py -q` → `85 passed`
**進度邊界**
- 整體 AI 自動化飛輪進度仍維持 `61%`;這輪是自健診可觀測性與 SLO 精準度補強,不代表自動修復成功率已提升。
- 下一步需推 Gitea main、等待 production deploy,並以正式 `/metrics` / `/api/v1/ai/slo` 驗證新 SLO 是否被 Prometheus 抓到。
## 2026-06-01:IwoooS 首層漸進揭露使用體驗收斂
**背景**