fix(governance): stabilize adr100 km growth slo
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-14 19:33:52 +08:00
parent cdb8bf6802
commit d2a4a17969
9 changed files with 267 additions and 30 deletions

View File

@@ -33,8 +33,11 @@ class VerificationSample:
# Frozen (immutable) container for the DB-derived counts that are rendered as
# ADR-100 SLO metrics. Every field has a default, so an empty snapshot can be
# built with no arguments.
@dataclass(frozen=True)
class Adr100SloMetricsSnapshot:
# Per-(outcome, operation_type) counts built from the _AUTOMATION_OPERATION_SQL rows.
automation_operations: list[AutomationOperationSample] = field(default_factory=list)
# Same sample shape, built from the _AUTOMATION_OPERATION_24H_SQL rows
# (rows created in the last 24 hours).
automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
# Per-outcome counts built from the _POST_EXECUTION_VERIFICATION_SQL rows.
post_execution_verifications: list[VerificationSample] = field(default_factory=list)
# Same sample shape, built from the _POST_EXECUTION_VERIFICATION_24H_SQL rows
# (rows collected in the last 24 hours).
post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
# count(*) over the whole knowledge_entries table.
knowledge_entries_total: int = 0
# count(*) of knowledge_entries rows with created_at within the last 24 hours.
knowledge_entries_created_24h: int = 0
# Totals read from the single row returned by _HIGH_CONFIDENCE_APPROVAL_SQL.
high_confidence_total: int = 0
high_confidence_success_total: int = 0
# Filled by calling `time` when the snapshot is constructed.
# NOTE(review): presumably `time.time` (epoch seconds) — confirm against the file's imports.
emitted_at: float = field(default_factory=time)
@@ -52,13 +55,33 @@ class Adr100SloMetricsService:
automation_rows = (
await db.execute(text(_AUTOMATION_OPERATION_SQL))
).fetchall()
automation_24h_rows = (
await db.execute(text(_AUTOMATION_OPERATION_24H_SQL))
).fetchall()
verification_rows = (
await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
).fetchall()
verification_24h_rows = (
await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL))
).fetchall()
knowledge_total = int(
(await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
or 0
)
knowledge_created_24h = int(
(
await db.execute(
text(
"""
SELECT count(*)
FROM knowledge_entries
WHERE created_at >= NOW() - INTERVAL '24 hours'
"""
)
)
).scalar()
or 0
)
confidence_row = (
await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
).one()
@@ -72,6 +95,14 @@ class Adr100SloMetricsService:
)
for row in automation_rows
],
automation_operations_24h=[
AutomationOperationSample(
outcome=str(row.outcome),
operation_type=str(row.operation_type),
count=int(row.count or 0),
)
for row in automation_24h_rows
],
post_execution_verifications=[
VerificationSample(
outcome=str(row.outcome),
@@ -79,7 +110,15 @@ class Adr100SloMetricsService:
)
for row in verification_rows
],
post_execution_verifications_24h=[
VerificationSample(
outcome=str(row.outcome),
count=int(row.count or 0),
)
for row in verification_24h_rows
],
knowledge_entries_total=knowledge_total,
knowledge_entries_created_24h=knowledge_created_24h,
high_confidence_total=int(confidence_row.high_confidence_total or 0),
high_confidence_success_total=int(
confidence_row.high_confidence_success_total or 0
@@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
'automation_operation_log_total{outcome="none",operation_type="none"} 0'
)
lines.extend([
"# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
"# TYPE automation_operation_created_24h gauge",
])
if snapshot.automation_operations_24h:
for sample in snapshot.automation_operations_24h:
lines.append(
"automation_operation_created_24h"
f'{{outcome="{_escape_label(sample.outcome)}",'
f'operation_type="{_escape_label(sample.operation_type)}"}} '
f"{sample.count}"
)
else:
lines.append(
'automation_operation_created_24h{outcome="none",operation_type="none"} 0'
)
lines.extend([
"# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
"# TYPE post_execution_verification_total counter",
@@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
else:
lines.append('post_execution_verification_total{outcome="none"} 0')
lines.extend([
"# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
"# TYPE post_execution_verification_created_24h gauge",
])
if snapshot.post_execution_verifications_24h:
for sample in snapshot.post_execution_verifications_24h:
lines.append(
"post_execution_verification_created_24h"
f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
)
else:
lines.append('post_execution_verification_created_24h{outcome="none"} 0')
lines.extend([
"# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
"# TYPE knowledge_entries_total counter",
f"knowledge_entries_total {snapshot.knowledge_entries_total}",
"# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
"# TYPE knowledge_entries_created_24h gauge",
f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
"# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
"# TYPE approval_records_high_confidence_total counter",
f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
@@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """
"""
# Row counts per (outcome, operation_type) over the last 24 hours, combining
# two sources with UNION ALL before grouping:
#   * automation_operation_log rows whose operation_type is one of the five
#     listed execution/remediation types. outcome is the row's status when the
#     status is not 'success'; otherwise it is 'human_required' when actor is
#     'approval_execution' and input->>'requested_by' (empty string if NULL)
#     does not match the case-insensitive 'auto...' prefix pattern, and
#     'auto_executed' in every other case.
#   * auto_repair_executions rows, always reported with operation_type
#     'auto_repair_executed'; outcome is 'auto_executed' when success is true
#     and 'failed' otherwise (a NULL success therefore also counts as 'failed').
# Both sources are limited to created_at >= NOW() - INTERVAL '24 hours', so the
# window is evaluated by the database at query time.
# NOTE(review): the doubled '%%' in the ILIKE pattern looks like escaping for a
# %-style DB-API paramstyle — confirm it is still correct for the driver in use.
_AUTOMATION_OPERATION_24H_SQL = """
WITH automation_scope AS (
SELECT
CASE
WHEN status <> 'success' THEN status
WHEN actor = 'approval_execution'
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
THEN 'human_required'
ELSE 'auto_executed'
END AS outcome,
operation_type
FROM automation_operation_log
WHERE operation_type IN (
'playbook_executed',
'remediation_executed',
'remediation_verified',
'remediation_rolled_back',
'self_correction_attempted'
)
AND created_at >= NOW() - INTERVAL '24 hours'
UNION ALL
SELECT
CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
'auto_repair_executed' AS operation_type
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
)
SELECT
outcome,
operation_type,
count(*) AS count
FROM automation_scope
GROUP BY outcome, operation_type
ORDER BY outcome, operation_type
"""
_POST_EXECUTION_VERIFICATION_SQL = """
SELECT verification_result AS outcome, count(*) AS count
FROM incident_evidence
@@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """
"""
# Count of incident_evidence rows per verification_result (returned as
# "outcome"), limited to rows whose collected_at falls within the last
# 24 hours. Rows with a NULL verification_result are excluded, and the result
# is ordered by verification_result.
# NOTE(review): the window is on collected_at, while the metric rendered from
# these rows is named post_execution_verification_created_24h — confirm that
# collected_at is the intended timestamp.
_POST_EXECUTION_VERIFICATION_24H_SQL = """
SELECT verification_result AS outcome, count(*) AS count
FROM incident_evidence
WHERE verification_result IS NOT NULL
AND collected_at >= NOW() - INTERVAL '24 hours'
GROUP BY verification_result
ORDER BY verification_result
"""
_HIGH_CONFIDENCE_APPROVAL_SQL = """
WITH approval_confidence AS (
SELECT

View File

@@ -394,7 +394,7 @@ class GovernanceAgent:
SLO 1 自主化率: sli:autonomy_rate:5m 硬紅線 < 0.70
SLO 2 決策準確率: sli:decision_accuracy:5m 硬紅線 < 0.85
SLO 3 信心校準: sli:confidence_calibration:1h 硬紅線 < 0.70
SLO 4 KM 增長率: sli:km_growth_rate:24h 硬紅線 < 5
SLO 4 KM 增長率: knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5
2026-04-27 P3.4 by Claude — AI SLOADR-100
"""
@@ -409,7 +409,7 @@ class GovernanceAgent:
"autonomy_rate": "sli:autonomy_rate:5m",
"decision_accuracy": "sli:decision_accuracy:5m",
"confidence_calibration": "sli:confidence_calibration:1h",
"km_growth_rate": "sli:km_growth_rate:24h",
"km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h",
}
# 硬紅線:低於此值必須告警(非軟性警告)
hard_red_lines: dict[str, float] = {