fix(governance): stabilize adr100 km growth slo

2026-05-14 19:33:52 +08:00
parent cdb8bf6802
commit d2a4a17969
9 changed files with 267 additions and 30 deletions
--- a/apps/api/src/services/adr100_slo_metrics_service.py
+++ b/apps/api/src/services/adr100_slo_metrics_service.py
@@ -33,8 +33,11 @@ class VerificationSample:
@dataclass(frozen=True)
 class Adr100SloMetricsSnapshot:
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
+    automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
+    post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
    knowledge_entries_total: int = 0
+    knowledge_entries_created_24h: int = 0
    high_confidence_total: int = 0
    high_confidence_success_total: int = 0
    emitted_at: float = field(default_factory=time)
@@ -52,13 +55,33 @@ class Adr100SloMetricsService:
            automation_rows = (
                await db.execute(text(_AUTOMATION_OPERATION_SQL))
            ).fetchall()
+            automation_24h_rows = (
+                await db.execute(text(_AUTOMATION_OPERATION_24H_SQL))
+            ).fetchall()
            verification_rows = (
                await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
            ).fetchall()
+            verification_24h_rows = (
+                await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL))
+            ).fetchall()
            knowledge_total = int(
                (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
                or 0
            )
+            knowledge_created_24h = int(
+                (
+                    await db.execute(
+                        text(
+                            """
+                            SELECT count(*)
+                            FROM knowledge_entries
+                            WHERE created_at >= NOW() - INTERVAL '24 hours'
+                            """
+                        )
+                    )
+                ).scalar()
+                or 0
+            )
            confidence_row = (
                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
            ).one()
@@ -72,6 +95,14 @@ class Adr100SloMetricsService:
                )
                for row in automation_rows
            ],
+            automation_operations_24h=[
+                AutomationOperationSample(
+                    outcome=str(row.outcome),
+                    operation_type=str(row.operation_type),
+                    count=int(row.count or 0),
+                )
+                for row in automation_24h_rows
+            ],
            post_execution_verifications=[
                VerificationSample(
                    outcome=str(row.outcome),
@@ -79,7 +110,15 @@ class Adr100SloMetricsService:
                )
                for row in verification_rows
            ],
+            post_execution_verifications_24h=[
+                VerificationSample(
+                    outcome=str(row.outcome),
+                    count=int(row.count or 0),
+                )
+                for row in verification_24h_rows
+            ],
            knowledge_entries_total=knowledge_total,
+            knowledge_entries_created_24h=knowledge_created_24h,
            high_confidence_total=int(confidence_row.high_confidence_total or 0),
            high_confidence_success_total=int(
                confidence_row.high_confidence_success_total or 0
@@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
            'automation_operation_log_total{outcome="none",operation_type="none"} 0'
        )

+    lines.extend([
+        "# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
+        "# TYPE automation_operation_created_24h gauge",
+    ])
+    if snapshot.automation_operations_24h:
+        for sample in snapshot.automation_operations_24h:
+            lines.append(
+                "automation_operation_created_24h"
+                f'{{outcome="{_escape_label(sample.outcome)}",'
+                f'operation_type="{_escape_label(sample.operation_type)}"}} '
+                f"{sample.count}"
+            )
+    else:
+        lines.append(
+            'automation_operation_created_24h{outcome="none",operation_type="none"} 0'
+        )
+
    lines.extend([
        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
        "# TYPE post_execution_verification_total counter",
@@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
    else:
        lines.append('post_execution_verification_total{outcome="none"} 0')

+    lines.extend([
+        "# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
+        "# TYPE post_execution_verification_created_24h gauge",
+    ])
+    if snapshot.post_execution_verifications_24h:
+        for sample in snapshot.post_execution_verifications_24h:
+            lines.append(
+                "post_execution_verification_created_24h"
+                f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
+            )
+    else:
+        lines.append('post_execution_verification_created_24h{outcome="none"} 0')
+
    lines.extend([
        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
        "# TYPE knowledge_entries_total counter",
        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
+        "# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
+        "# TYPE knowledge_entries_created_24h gauge",
+        f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_total counter",
        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
@@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """
 """


+# Row counts per (outcome, operation_type) for automation activity whose
+# created_at falls within the trailing 24 hours (NOW() - INTERVAL '24 hours').
+#
+# Two sources are combined with UNION ALL before grouping:
+#   * automation_operation_log, limited to the five listed operation types.
+#     outcome is the raw status when status <> 'success'; 'human_required'
+#     when actor = 'approval_execution' and input->>'requested_by' (a missing
+#     value is coalesced to '') does not start with 'auto', compared
+#     case-insensitively; otherwise 'auto_executed'.
+#   * auto_repair_executions, reported under the fixed operation_type
+#     'auto_repair_executed' with outcome 'auto_executed' when success is
+#     true and 'failed' otherwise.
+#
+# NOTE(review): the doubled percent in 'auto%%' looks like an escape for a
+# driver that uses %-style parameters. If it reaches the database unchanged it
+# is simply two wildcards and matches the same rows as 'auto%'. Confirm
+# against the driver in use.
+_AUTOMATION_OPERATION_24H_SQL = """
+    WITH automation_scope AS (
+        SELECT
+            CASE
+                WHEN status <> 'success' THEN status
+                WHEN actor = 'approval_execution'
+                     AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
+                    THEN 'human_required'
+                ELSE 'auto_executed'
+            END AS outcome,
+            operation_type
+        FROM automation_operation_log
+        WHERE operation_type IN (
+            'playbook_executed',
+            'remediation_executed',
+            'remediation_verified',
+            'remediation_rolled_back',
+            'self_correction_attempted'
+        )
+          AND created_at >= NOW() - INTERVAL '24 hours'
+        UNION ALL
+        SELECT
+            CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
+            'auto_repair_executed' AS operation_type
+        FROM auto_repair_executions
+        WHERE created_at >= NOW() - INTERVAL '24 hours'
+    )
+    SELECT
+        outcome,
+        operation_type,
+        count(*) AS count
+    FROM automation_scope
+    GROUP BY outcome, operation_type
+    ORDER BY outcome, operation_type
+"""
+
+
 _POST_EXECUTION_VERIFICATION_SQL = """
    SELECT verification_result AS outcome, count(*) AS count
    FROM incident_evidence
@@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """
 """


+# Row counts per verification_result (returned as "outcome") for
+# incident_evidence rows that have a non-NULL verification_result and whose
+# collected_at falls within the trailing 24 hours. Results are grouped and
+# ordered by verification_result.
+# Note that the window is applied to collected_at, whereas the other 24-hour
+# queries in this module filter on created_at.
+_POST_EXECUTION_VERIFICATION_24H_SQL = """
+    SELECT verification_result AS outcome, count(*) AS count
+    FROM incident_evidence
+    WHERE verification_result IS NOT NULL
+      AND collected_at >= NOW() - INTERVAL '24 hours'
+    GROUP BY verification_result
+    ORDER BY verification_result
+"""
+
+
 _HIGH_CONFIDENCE_APPROVAL_SQL = """
    WITH approval_confidence AS (
        SELECT
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -394,7 +394,7 @@ class GovernanceAgent:
        SLO 1 自主化率:     sli:autonomy_rate:5m      硬紅線 < 0.70
        SLO 2 決策準確率:   sli:decision_accuracy:5m  硬紅線 < 0.85
        SLO 3 信心校準:     sli:confidence_calibration:1h 硬紅線 < 0.70
-        SLO 4 KM 增長率:    sli:km_growth_rate:24h    硬紅線 < 5
+        SLO 4 KM 增長率:    knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5

        2026-04-27 P3.4 by Claude — AI SLO（ADR-100）
        """
@@ -409,7 +409,7 @@ class GovernanceAgent:
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
-            "km_growth_rate": "sli:km_growth_rate:24h",
+            "km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h",
        }
        # 硬紅線：低於此值必須告警（非軟性警告）
        hard_red_lines: dict[str, float] = {