fix(api): add quality summary slo metric

2026-06-01 17:00:50 +08:00
parent 9954e97710
commit d6c904dd0f
9 changed files with 377 additions and 27 deletions
--- a/apps/api/src/api/v1/platform/truth_chain.py
+++ b/apps/api/src/api/v1/platform/truth_chain.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+from time import perf_counter
 from typing import Any

 from fastapi import APIRouter, Depends, Query
@@ -13,6 +14,7 @@ from src.core.awooop_operator_auth import (
 from src.services.awooop_truth_chain_service import (
    fetch_automation_quality_summary,
    fetch_truth_chain,
+    record_quality_summary_observation,
 )

 router = APIRouter()
@@ -33,12 +35,25 @@ async def get_automation_quality_summary(
    limit: int = Query(200, ge=1, le=500, description="最多評估 incident 數"),
    refresh: bool = Query(False, description="略過短 TTL 快取並重新聚合"),
 ) -> dict[str, Any]:
-    summary = await fetch_automation_quality_summary(
-        project_id=project_id,
-        hours=hours,
-        limit=limit,
-        refresh=refresh,
-    )
+    started_at = perf_counter()
+    try:
+        summary = await fetch_automation_quality_summary(
+            project_id=project_id,
+            hours=hours,
+            limit=limit,
+            refresh=refresh,
+        )
+    except Exception as exc:
+        record_quality_summary_observation(
+            project_id=project_id,
+            hours=hours,
+            limit=limit,
+            cache_status="error",
+            success=False,
+            duration_seconds=perf_counter() - started_at,
+            error=exc.__class__.__name__,
+        )
+        raise
    summary["examples"] = []
    summary["visibility_note"] = (
        "Aggregate only. Use /truth-chain/{source_id} with operator auth for source-level details."
--- a/apps/api/src/services/adr100_slo_metrics_service.py
+++ b/apps/api/src/services/adr100_slo_metrics_service.py
@@ -15,6 +15,7 @@ from time import time
 from sqlalchemy import text

 from src.db.base import get_db_context
+from src.services.awooop_truth_chain_service import get_quality_summary_observations


@dataclass(frozen=True)
@@ -30,6 +31,18 @@ class VerificationSample:
    count: int


+@dataclass(frozen=True)
+class QualitySummaryObservation:
+    project_id: str
+    hours: int
+    limit: int
+    cache_status: str
+    success: bool
+    duration_seconds: float
+    observed_at: float
+    error: str | None = None
+
+
@dataclass(frozen=True)
 class Adr100SloMetricsSnapshot:
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
@@ -40,6 +53,7 @@ class Adr100SloMetricsSnapshot:
    knowledge_entries_created_24h: int = 0
    high_confidence_total: int = 0
    high_confidence_success_total: int = 0
+    quality_summary_observations: list[QualitySummaryObservation] = field(default_factory=list)
    emitted_at: float = field(default_factory=time)


@@ -123,6 +137,23 @@ class Adr100SloMetricsService:
            high_confidence_success_total=int(
                confidence_row.high_confidence_success_total or 0
            ),
+            quality_summary_observations=[
+                QualitySummaryObservation(
+                    project_id=str(row.get("project_id") or "awoooi"),
+                    hours=int(row.get("hours") or 0),
+                    limit=int(row.get("limit") or 0),
+                    cache_status=str(row.get("cache_status") or "unknown"),
+                    success=bool(row.get("success")),
+                    duration_seconds=float(row.get("duration_seconds") or 0.0),
+                    observed_at=float(row.get("observed_at") or 0.0),
+                    error=(
+                        str(row.get("error"))
+                        if row.get("error") is not None
+                        else None
+                    ),
+                )
+                for row in get_quality_summary_observations()
+            ],
        )


@@ -208,8 +239,56 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
        "# HELP adr100_slo_emitter_last_success_timestamp Last successful ADR-100 DB metrics emission timestamp",
        "# TYPE adr100_slo_emitter_last_success_timestamp gauge",
        f"adr100_slo_emitter_last_success_timestamp {snapshot.emitted_at:.0f}",
-        "",
    ])
+    lines.extend([
+        "# HELP awooop_truth_chain_quality_summary_last_duration_seconds Last observed AwoooP truth-chain quality summary aggregation duration",
+        "# TYPE awooop_truth_chain_quality_summary_last_duration_seconds gauge",
+    ])
+    if snapshot.quality_summary_observations:
+        for observation in snapshot.quality_summary_observations:
+            labels = _quality_summary_labels(observation)
+            lines.append(
+                "awooop_truth_chain_quality_summary_last_duration_seconds"
+                f"{labels} {observation.duration_seconds:.6f}"
+            )
+    else:
+        lines.append(
+            'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
+        )
+
+    lines.extend([
+        "# HELP awooop_truth_chain_quality_summary_last_success Last observed AwoooP truth-chain quality summary success flag",
+        "# TYPE awooop_truth_chain_quality_summary_last_success gauge",
+    ])
+    if snapshot.quality_summary_observations:
+        for observation in snapshot.quality_summary_observations:
+            labels = _quality_summary_labels(observation)
+            lines.append(
+                "awooop_truth_chain_quality_summary_last_success"
+                f"{labels} {1 if observation.success else 0}"
+            )
+    else:
+        lines.append(
+            'awooop_truth_chain_quality_summary_last_success{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
+        )
+
+    lines.extend([
+        "# HELP awooop_truth_chain_quality_summary_observed_timestamp Last observed AwoooP truth-chain quality summary timestamp",
+        "# TYPE awooop_truth_chain_quality_summary_observed_timestamp gauge",
+    ])
+    if snapshot.quality_summary_observations:
+        for observation in snapshot.quality_summary_observations:
+            labels = _quality_summary_labels(observation)
+            lines.append(
+                "awooop_truth_chain_quality_summary_observed_timestamp"
+                f"{labels} {observation.observed_at:.0f}"
+            )
+    else:
+        lines.append(
+            'awooop_truth_chain_quality_summary_observed_timestamp{project_id="none",hours="0",limit="0",cache_status="none",success="false"} 0'
+        )
+
+    lines.append("")
    return "\n".join(lines)


@@ -217,6 +296,18 @@ def _escape_label(value: str) -> str:
    return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')


+def _quality_summary_labels(observation: QualitySummaryObservation) -> str:
+    return (
+        "{"
+        f'project_id="{_escape_label(observation.project_id)}",'
+        f'hours="{observation.hours}",'
+        f'limit="{observation.limit}",'
+        f'cache_status="{_escape_label(observation.cache_status)}",'
+        f'success="{"true" if observation.success else "false"}"'
+        "}"
+    )
+
+
 _AUTOMATION_OPERATION_SQL = """
    WITH automation_scope AS (
        SELECT
--- a/apps/api/src/services/adr100_slo_status_service.py
+++ b/apps/api/src/services/adr100_slo_status_service.py
@@ -80,6 +80,16 @@ ADR100_SLO_DEFINITIONS: tuple[Adr100SloDefinition, ...] = (
        unit="count",
        window="24h",
    ),
+    Adr100SloDefinition(
+        name="truth_chain_quality_summary_latency",
+        query='max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
+        target=2.0,
+        hard_red_line=8.0,
+        direction="below",
+        unit="seconds",
+        window="last_observation",
+        minimum_events=0.0,
+    ),
 )


--- a/apps/api/src/services/awooop_truth_chain_service.py
+++ b/apps/api/src/services/awooop_truth_chain_service.py
@@ -13,6 +13,7 @@ import shutil
 from datetime import UTC, date, datetime, timedelta
 from decimal import Decimal
 from pathlib import Path
+from time import perf_counter, time
 from typing import Any
 from uuid import UUID

@@ -37,6 +38,54 @@ _JSON_TEXT_FIELDS = {"gate_result", "source_envelope"}
 _QUALITY_SUMMARY_CACHE_TTL_SECONDS = int(
    os.getenv("AWOOOP_QUALITY_SUMMARY_CACHE_TTL_SECONDS", "30")
 )
+_QUALITY_SUMMARY_OBSERVATIONS: dict[str, dict[str, Any]] = {}
+
+
+def record_quality_summary_observation(
+    *,
+    project_id: str,
+    hours: int,
+    limit: int,
+    cache_status: str,
+    success: bool,
+    duration_seconds: float,
+    error: str | None = None,
+) -> None:
+    normalized_project_id = project_id or "awoooi"
+    normalized_cache_status = cache_status or "unknown"
+    key = "|".join([
+        normalized_project_id,
+        str(int(hours)),
+        str(int(limit)),
+        normalized_cache_status,
+        "success" if success else "failed",
+    ])
+    _QUALITY_SUMMARY_OBSERVATIONS[key] = {
+        "project_id": normalized_project_id,
+        "hours": int(hours),
+        "limit": int(limit),
+        "cache_status": normalized_cache_status,
+        "success": bool(success),
+        "duration_seconds": max(0.0, float(duration_seconds)),
+        "observed_at": time(),
+        "error": str(error)[:160] if error else None,
+    }
+
+
+def get_quality_summary_observations() -> list[dict[str, Any]]:
+    return [
+        dict(observation)
+        for observation in sorted(
+            _QUALITY_SUMMARY_OBSERVATIONS.values(),
+            key=lambda item: (
+                str(item.get("project_id") or ""),
+                int(item.get("hours") or 0),
+                int(item.get("limit") or 0),
+                str(item.get("cache_status") or ""),
+                bool(item.get("success")),
+            ),
+        )
+    ]


 def _clean(value: Any) -> Any:
@@ -2079,6 +2128,7 @@ async def fetch_automation_quality_summary(
    refresh: bool = False,
 ) -> dict[str, Any]:
    """Return a recent incident-level quality summary for the automation flywheel."""
+    started_at = perf_counter()
    bounded_hours = max(1, min(int(hours), 168))
    bounded_limit = max(1, min(int(limit), 500))
    normalized_project_id = project_id or "awoooi"
@@ -2094,13 +2144,26 @@ async def fetch_automation_quality_summary(
            ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
        )
        if cached_summary is not None:
+            duration_seconds = perf_counter() - started_at
+            record_quality_summary_observation(
+                project_id=normalized_project_id,
+                hours=bounded_hours,
+                limit=bounded_limit,
+                cache_status="hit",
+                success=True,
+                duration_seconds=duration_seconds,
+            )
            logger.info(
                "awooop_automation_quality_summary_cache_hit",
                project_id=normalized_project_id,
                window_hours=bounded_hours,
                limit=bounded_limit,
                ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
+                duration_seconds=round(duration_seconds, 3),
            )
+            cached_summary = dict(cached_summary)
+            cached_summary["cache_status"] = "hit"
+            cached_summary["aggregation_duration_seconds"] = round(duration_seconds, 3)
            return cached_summary

    cutoff = datetime.now(UTC) - timedelta(hours=bounded_hours)
@@ -2525,9 +2588,22 @@ async def fetch_automation_quality_summary(
        cache_status="miss",
        cache_ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
    )
-    return await store_operator_summary_async(
+    stored_summary = await store_operator_summary_async(
        "truth_chain_quality_summary",
        cache_key,
        summary,
        ttl_seconds=_QUALITY_SUMMARY_CACHE_TTL_SECONDS,
    )
+    duration_seconds = perf_counter() - started_at
+    record_quality_summary_observation(
+        project_id=normalized_project_id,
+        hours=bounded_hours,
+        limit=bounded_limit,
+        cache_status="miss",
+        success=True,
+        duration_seconds=duration_seconds,
+    )
+    stored_summary = dict(stored_summary)
+    stored_summary["cache_status"] = "miss"
+    stored_summary["aggregation_duration_seconds"] = round(duration_seconds, 3)
+    return stored_summary
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -5,7 +5,7 @@
 2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
 3. llm_hallucination   — 近 100 筆 evidence verification_result=failed 比例 > 10%
 4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
-5. slo_compliance      — 4 個 SLO 合規性檢查（ADR-100），違反時降級飛輪行為
+5. slo_compliance      — 5 個 SLO 合規性檢查（ADR-100），違反時降級飛輪行為

 所有 check 互相隔離（try/except），任一失敗不阻斷其他項目。

@@ -57,6 +57,36 @@ RECENT_LIMIT = 100                    # 最近幾筆做統計
 GOVERNANCE_SELF_CHECK_LEASE_KEY = "governance:self_check:cycle_lease"


+def _slo_remediation_items(name: str) -> list[str]:
+    if name == "truth_chain_quality_summary_latency":
+        return [
+            "Check truth-chain quality summary cache miss latency and DB query plan",
+            "Confirm operator summary cache is warm before treating homepage SLO as degraded",
+        ]
+    return [
+        "Pause auto-scaling or risky auto-fix tasks",
+        "Review evidence/decision traces and adjust policy thresholds",
+    ]
+
+
+def _slo_actionable_items(name: str) -> list[str]:
+    if name == "truth_chain_quality_summary_latency":
+        return [
+            "Call /api/v1/platform/truth-chain/quality/summary?limit=8&refresh=true and compare duration",
+            "Inspect /metrics for awooop_truth_chain_quality_summary_last_duration_seconds",
+        ]
+    return [
+        "Check verifier lag and post-exec learning health",
+        "Run emergency incident audit on failed approvals",
+    ]
+
+
+def _slo_next_action(name: str) -> str:
+    if name == "truth_chain_quality_summary_latency":
+        return "run_truth_chain_quality_summary_latency_probe"
+    return "trigger_flywheel_safeguard"
+
+
 # =============================================================================
 # GovernanceAgent
 # =============================================================================
@@ -421,7 +451,7 @@ class GovernanceAgent:
    # =========================================================================

    async def check_slo_compliance(self) -> dict[str, Any]:
-        """SLO 4 項合規性檢查 — 違反時降級飛輪行為
+        """SLO 5 項合規性檢查 — 違反時降級飛輪行為

        從 Prometheus Recording rules 讀取 SLI 值，
        與硬紅線閾值比對，違反時呼叫 _alert() 寫 PG + 推 Telegram。
@@ -430,6 +460,7 @@ class GovernanceAgent:
        SLO 2 決策準確率:   sli:decision_accuracy:5m  硬紅線 < 0.85
        SLO 3 信心校準:     sli:confidence_calibration:1h 硬紅線 < 0.70
        SLO 4 KM 增長率:    knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5
+        SLO 5 總覽延遲:     awooop_truth_chain_quality_summary_last_duration_seconds 硬紅線 > 8s

        2026-04-27 P3.4 by Claude — AI SLO（ADR-100）
        """
@@ -446,13 +477,15 @@ class GovernanceAgent:
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
            "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
+            "truth_chain_quality_summary_latency": 'max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
        }
-        # 硬紅線：低於此值必須告警（非軟性警告）
+        # 硬紅線：above 指標低於此值、below 指標高於此值時必須告警（非軟性警告）
        hard_red_lines: dict[str, float] = {
            "autonomy_rate": 0.70,
            "decision_accuracy": 0.85,
            "confidence_calibration": 0.70,
            "km_growth_rate": 5.0,
+            "truth_chain_quality_summary_latency": 8.0,
        }
        # SLO 目標值（供日誌記錄）
        slo_targets: dict[str, float] = {
@@ -460,6 +493,14 @@ class GovernanceAgent:
            "decision_accuracy": 0.90,
            "confidence_calibration": 0.80,
            "km_growth_rate": 20.0,
+            "truth_chain_quality_summary_latency": 2.0,
+        }
+        slo_directions: dict[str, str] = {
+            "autonomy_rate": "above",
+            "decision_accuracy": "above",
+            "confidence_calibration": "above",
+            "km_growth_rate": "above",
+            "truth_chain_quality_summary_latency": "below",
        }

        results: dict[str, Any] = {}
@@ -511,7 +552,17 @@ class GovernanceAgent:
                            continue
                        threshold = hard_red_lines[name]
                        target = slo_targets[name]
-                        violated = value < threshold
+                        direction = slo_directions.get(name, "above")
+                        violated = value > threshold if direction == "below" else value < threshold
+                        gap = (
+                            value - threshold
+                            if violated and direction == "below"
+                            else threshold - value
+                            if violated
+                            else target - value
+                            if direction == "below"
+                            else value - target
+                        )

                        results[name] = {
                            "name": name,
@@ -519,7 +570,8 @@ class GovernanceAgent:
                            "value": round(value, 4),
                            "slo_target": target,
                            "hard_red_line": threshold,
-                            "gap": round(threshold - value, 4) if violated else round(value - target, 4),
+                            "direction": direction,
+                            "gap": round(gap, 4),
                            "violated": violated,
                        }

@@ -533,20 +585,15 @@ class GovernanceAgent:
                                        "value": round(value, 4),
                                        "target": target,
                                        "threshold": threshold,
-                                        "gap": round(threshold - value, 4),
+                                        "direction": direction,
+                                        "gap": round(gap, 4),
                                    },
                                    "remediation": {
-                                        "items": [
-                                            "Pause auto-scaling or risky auto-fix tasks",
-                                            "Review evidence/decision traces and adjust policy thresholds",
-                                        ],
-                                        "next_action": "trigger_flywheel_safeguard",
+                                        "items": _slo_remediation_items(name),
+                                        "next_action": _slo_next_action(name),
                                    },
                                    "actionable": {
-                                        "items": [
-                                            "Check verifier lag and post-exec learning health",
-                                            "Run emergency incident audit on failed approvals",
-                                        ],
+                                        "items": _slo_actionable_items(name),
                                    },
                                },
                            )
@@ -716,7 +763,7 @@ class GovernanceAgent:
                        "actionable": {
                            "items": [
                                "先確認 /metrics 是否已輸出 ADR-100 底層指標",
-                                "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
+                                "檢查 Prometheus rule 與 truth-chain quality summary runtime metric 是否可查詢",
                            ],
                        },
                    },
--- a/apps/api/tests/test_adr100_slo_metrics_service.py
+++ b/apps/api/tests/test_adr100_slo_metrics_service.py
@@ -1,6 +1,7 @@
 from src.services.adr100_slo_metrics_service import (
    Adr100SloMetricsSnapshot,
    AutomationOperationSample,
+    QualitySummaryObservation,
    VerificationSample,
    render_adr100_slo_metrics,
 )
@@ -38,6 +39,17 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
        knowledge_entries_created_24h=25,
        high_confidence_total=9,
        high_confidence_success_total=7,
+        quality_summary_observations=[
+            QualitySummaryObservation(
+                project_id="awoooi",
+                hours=24,
+                limit=8,
+                cache_status="miss",
+                success=True,
+                duration_seconds=1.234567,
+                observed_at=1_778_756_100,
+            ),
+        ],
        emitted_at=1_778_756_000,
    )

@@ -58,6 +70,18 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
    assert "approval_records_high_confidence_total 9" in rendered
    assert "approval_records_high_confidence_success_total 7" in rendered
    assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered
+    assert (
+        'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",'
+        'hours="24",limit="8",cache_status="miss",success="true"} 1.234567'
+    ) in rendered
+    assert (
+        'awooop_truth_chain_quality_summary_last_success{project_id="awoooi",'
+        'hours="24",limit="8",cache_status="miss",success="true"} 1'
+    ) in rendered
+    assert (
+        'awooop_truth_chain_quality_summary_observed_timestamp{project_id="awoooi",'
+        'hours="24",limit="8",cache_status="miss",success="true"} 1778756100'
+    ) in rendered


 def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
@@ -71,6 +95,10 @@ def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
    assert 'post_execution_verification_created_24h{outcome="none"} 0' in rendered
    assert "knowledge_entries_total 0" in rendered
    assert "knowledge_entries_created_24h 0" in rendered
+    assert (
+        'awooop_truth_chain_quality_summary_last_duration_seconds{project_id="none",'
+        'hours="0",limit="0",cache_status="none",success="false"} 0'
+    ) in rendered


 def test_render_adr100_slo_metrics_escapes_labels() -> None:
--- a/apps/api/tests/test_adr100_slo_status_service.py
+++ b/apps/api/tests/test_adr100_slo_status_service.py
@@ -10,6 +10,12 @@ from src.services.adr100_slo_status_service import (
 )


+QUALITY_SUMMARY_LATENCY_QUERY = (
+    'max(awooop_truth_chain_quality_summary_last_duration_seconds{'
+    'project_id="awoooi",limit="8",success="true"})'
+)
+
+
 class _FakePrometheusResponse:
    def __init__(self, payload: dict[str, Any]) -> None:
        self._payload = payload
@@ -89,6 +95,7 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "24",
+        QUALITY_SUMMARY_LATENCY_QUERY: "1.2",
    }

    monkeypatch.setattr(
@@ -109,6 +116,8 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
    assert by_name["confidence_calibration"]["status"] == "skipped_low_volume"
    assert by_name["km_growth_rate"]["status"] == "ok"
    assert by_name["km_growth_rate"]["value"] == 24
+    assert by_name["truth_chain_quality_summary_latency"]["status"] == "ok"
+    assert by_name["truth_chain_quality_summary_latency"]["direction"] == "below"
    assert report["overall_status"] == "partial"
    assert report["overall_compliance"] == 1.0

@@ -121,6 +130,7 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "3",
+        QUALITY_SUMMARY_LATENCY_QUERY: "1.2",
    }

    monkeypatch.setattr(
@@ -142,6 +152,36 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
    assert report["overall_status"] == "violated"


+@pytest.mark.asyncio
+async def test_fetch_report_classifies_below_direction_slo(monkeypatch):
+    values = {
+        "sum(rate(automation_operation_log_total[5m]))": "0",
+        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
+        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
+        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "24",
+        QUALITY_SUMMARY_LATENCY_QUERY: "9.5",
+    }
+
+    monkeypatch.setattr(
+        "httpx.AsyncClient",
+        lambda *args, **kwargs: _FakePrometheusClient(values),
+    )
+    monkeypatch.setattr(
+        Adr100SloStatusService,
+        "_fetch_verification_coverage",
+        _low_volume_coverage,
+    )
+
+    report = await Adr100SloStatusService().fetch_report()
+
+    by_name = {metric["name"]: metric for metric in report["metrics"]}
+    latency = by_name["truth_chain_quality_summary_latency"]
+    assert latency["status"] == "violated"
+    assert latency["direction"] == "below"
+    assert latency["value"] == 9.5
+    assert report["overall_status"] == "violated"
+
+
 def test_verification_coverage_payload_flags_backlog():
    payload = _build_verification_coverage_payload(
        {
--- a/apps/api/tests/test_governance_agent.py
+++ b/apps/api/tests/test_governance_agent.py
@@ -797,6 +797,12 @@ class TestRunSelfCheckGlobalFailureAlert:
        assert "governance_self_failure" not in calls


+QUALITY_SUMMARY_LATENCY_QUERY = (
+    'max(awooop_truth_chain_quality_summary_last_duration_seconds{'
+    'project_id="awoooi",limit="8",success="true"})'
+)
+
+
 class _FakePrometheusResponse:
    def __init__(self, value: str) -> None:
        self._value = value
@@ -809,7 +815,7 @@ class _FakePrometheusResponse:


 class _FakePrometheusClient:
-    def __init__(self, value: str) -> None:
+    def __init__(self, value: str | dict[str, str]) -> None:
        self._value = value
        self.queries: list[str] = []

@@ -820,7 +826,10 @@ class _FakePrometheusClient:
        return False

    async def get(self, *args, **kwargs):  # noqa: ANN002, ANN003
-        self.queries.append(str(kwargs.get("params", {}).get("query", "")))
+        query = str(kwargs.get("params", {}).get("query", ""))
+        self.queries.append(query)
+        if isinstance(self._value, dict):
+            return _FakePrometheusResponse(self._value.get(query, "NaN"))
        return _FakePrometheusResponse(self._value)


@@ -838,6 +847,7 @@ class TestCheckSloCompliance:
            "decision_accuracy",
            "confidence_calibration",
            "km_growth_rate",
+            "truth_chain_quality_summary_latency",
        ):
            assert result[name]["status"] == "skipped"
            assert result[name]["reason"] == "prometheus_nan_or_inf"
@@ -847,7 +857,13 @@ class TestCheckSloCompliance:
    async def test_km_growth_prefers_db_derived_24h_gauge(self):
        """KM SLO 要優先使用 DB 24h gauge，避免新 counter 暖機時誤報 0."""
        agent = _make_agent()
-        client = _FakePrometheusClient("25")
+        client = _FakePrometheusClient({
+            "sli:autonomy_rate:5m": "0.95",
+            "sli:decision_accuracy:5m": "0.96",
+            "sli:confidence_calibration:1h": "0.97",
+            "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "25",
+            QUALITY_SUMMARY_LATENCY_QUERY: "1.1",
+        })

        with patch("httpx.AsyncClient", return_value=client):
            result = await agent.check_slo_compliance()
@@ -855,3 +871,4 @@ class TestCheckSloCompliance:
        assert "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)" in client.queries
        assert result["km_growth_rate"]["status"] == "ok"
        assert result["km_growth_rate"]["value"] == 25
+        assert result["truth_chain_quality_summary_latency"]["status"] == "ok"
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,29 @@
+## 2026-06-01｜truth-chain quality summary 納入 AI 自健診 SLO
+
+**背景**：
+
+- 正式環境已把 `/api/v1/platform/truth-chain/quality/summary` 的 N+1 查詢修成批次化，但「首頁/quality summary 是否又變慢」尚未進入 AI 自健診。
+- 先前飛輪核心異常只會看到泛化的 `auto_execute_success_rate`，無法快速判斷是治理資料、執行資料，還是 operator summary 資料面拖慢。
+
+**本次調整**：
+
+- `apps/api/src/services/awooop_truth_chain_service.py`：記錄 quality summary 的 cache hit / miss 聚合耗時與最後觀測時間；觀測值存於模組層級的記憶體 dict，以 project_id / hours / limit / cache_status / success 為鍵，每組鍵只保留最後一筆觀測。
+- `apps/api/src/api/v1/platform/truth_chain.py`：端點例外時也寫入 failure observation，讓 `/metrics` 能看見摘要面失敗。
+- `apps/api/src/services/adr100_slo_metrics_service.py`：新增 `awooop_truth_chain_quality_summary_last_duration_seconds`、`awooop_truth_chain_quality_summary_last_success`、`awooop_truth_chain_quality_summary_observed_timestamp`。
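+- `apps/api/src/services/adr100_slo_status_service.py`：新增第 5 個 ADR-100 SLO：`truth_chain_quality_summary_latency`，目標 `< 2s`、硬紅線 `> 8s`；查詢只取 `project_id="awoooi"`、`limit="8"`、`success="true"` 序列的最大值（未依 `cache_status` 篩選，cache hit 與 miss 皆納入），失敗觀測與其他 limit 的請求不計入此 SLO。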
+- `apps/api/src/services/governance_agent.py`：SLO 判斷支援 `above` / `below` 方向，避免把 latency 這種「越低越好」的指標誤判。
+
+**驗證**：
+
+- `python3 -m py_compile apps/api/src/services/awooop_truth_chain_service.py apps/api/src/services/adr100_slo_metrics_service.py apps/api/src/services/adr100_slo_status_service.py apps/api/src/services/governance_agent.py apps/api/src/api/v1/platform/truth_chain.py`
+- `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK`
+- `DATABASE_URL=postgresql://test:test@localhost:5432/test PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_adr100_slo_metrics_service.py apps/api/tests/test_adr100_slo_status_service.py apps/api/tests/test_governance_agent.py apps/api/tests/test_awooop_truth_chain_service.py -q` → `85 passed`
+
+**進度邊界**：
+
+- 整體 AI 自動化飛輪進度仍維持 `61%`；這輪是自健診可觀測性與 SLO 精準度補強，不代表自動修復成功率已提升。
+- 下一步需推 Gitea main、等待 production deploy，並以正式 `/metrics` / `/api/v1/ai/slo` 驗證新 SLO 是否被 Prometheus 抓到。
+
 ## 2026-06-01｜IwoooS 首層漸進揭露使用體驗收斂

 **背景**：