fix(governance): stabilize adr100 km growth slo
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -33,8 +33,11 @@ class VerificationSample:
|
||||
@dataclass(frozen=True)
|
||||
class Adr100SloMetricsSnapshot:
|
||||
automation_operations: list[AutomationOperationSample] = field(default_factory=list)
|
||||
automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
|
||||
post_execution_verifications: list[VerificationSample] = field(default_factory=list)
|
||||
post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
|
||||
knowledge_entries_total: int = 0
|
||||
knowledge_entries_created_24h: int = 0
|
||||
high_confidence_total: int = 0
|
||||
high_confidence_success_total: int = 0
|
||||
emitted_at: float = field(default_factory=time)
|
||||
@@ -52,13 +55,33 @@ class Adr100SloMetricsService:
|
||||
automation_rows = (
|
||||
await db.execute(text(_AUTOMATION_OPERATION_SQL))
|
||||
).fetchall()
|
||||
automation_24h_rows = (
|
||||
await db.execute(text(_AUTOMATION_OPERATION_24H_SQL))
|
||||
).fetchall()
|
||||
verification_rows = (
|
||||
await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
|
||||
).fetchall()
|
||||
verification_24h_rows = (
|
||||
await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL))
|
||||
).fetchall()
|
||||
knowledge_total = int(
|
||||
(await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
|
||||
or 0
|
||||
)
|
||||
knowledge_created_24h = int(
|
||||
(
|
||||
await db.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT count(*)
|
||||
FROM knowledge_entries
|
||||
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
||||
"""
|
||||
)
|
||||
)
|
||||
).scalar()
|
||||
or 0
|
||||
)
|
||||
confidence_row = (
|
||||
await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
|
||||
).one()
|
||||
@@ -72,6 +95,14 @@ class Adr100SloMetricsService:
|
||||
)
|
||||
for row in automation_rows
|
||||
],
|
||||
automation_operations_24h=[
|
||||
AutomationOperationSample(
|
||||
outcome=str(row.outcome),
|
||||
operation_type=str(row.operation_type),
|
||||
count=int(row.count or 0),
|
||||
)
|
||||
for row in automation_24h_rows
|
||||
],
|
||||
post_execution_verifications=[
|
||||
VerificationSample(
|
||||
outcome=str(row.outcome),
|
||||
@@ -79,7 +110,15 @@ class Adr100SloMetricsService:
|
||||
)
|
||||
for row in verification_rows
|
||||
],
|
||||
post_execution_verifications_24h=[
|
||||
VerificationSample(
|
||||
outcome=str(row.outcome),
|
||||
count=int(row.count or 0),
|
||||
)
|
||||
for row in verification_24h_rows
|
||||
],
|
||||
knowledge_entries_total=knowledge_total,
|
||||
knowledge_entries_created_24h=knowledge_created_24h,
|
||||
high_confidence_total=int(confidence_row.high_confidence_total or 0),
|
||||
high_confidence_success_total=int(
|
||||
confidence_row.high_confidence_success_total or 0
|
||||
@@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
|
||||
'automation_operation_log_total{outcome="none",operation_type="none"} 0'
|
||||
)
|
||||
|
||||
lines.extend([
|
||||
"# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
|
||||
"# TYPE automation_operation_created_24h gauge",
|
||||
])
|
||||
if snapshot.automation_operations_24h:
|
||||
for sample in snapshot.automation_operations_24h:
|
||||
lines.append(
|
||||
"automation_operation_created_24h"
|
||||
f'{{outcome="{_escape_label(sample.outcome)}",'
|
||||
f'operation_type="{_escape_label(sample.operation_type)}"}} '
|
||||
f"{sample.count}"
|
||||
)
|
||||
else:
|
||||
lines.append(
|
||||
'automation_operation_created_24h{outcome="none",operation_type="none"} 0'
|
||||
)
|
||||
|
||||
lines.extend([
|
||||
"# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
|
||||
"# TYPE post_execution_verification_total counter",
|
||||
@@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
|
||||
else:
|
||||
lines.append('post_execution_verification_total{outcome="none"} 0')
|
||||
|
||||
lines.extend([
|
||||
"# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
|
||||
"# TYPE post_execution_verification_created_24h gauge",
|
||||
])
|
||||
if snapshot.post_execution_verifications_24h:
|
||||
for sample in snapshot.post_execution_verifications_24h:
|
||||
lines.append(
|
||||
"post_execution_verification_created_24h"
|
||||
f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
|
||||
)
|
||||
else:
|
||||
lines.append('post_execution_verification_created_24h{outcome="none"} 0')
|
||||
|
||||
lines.extend([
|
||||
"# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
|
||||
"# TYPE knowledge_entries_total counter",
|
||||
f"knowledge_entries_total {snapshot.knowledge_entries_total}",
|
||||
"# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
|
||||
"# TYPE knowledge_entries_created_24h gauge",
|
||||
f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
|
||||
"# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
|
||||
"# TYPE approval_records_high_confidence_total counter",
|
||||
f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
|
||||
@@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """
|
||||
"""
|
||||
|
||||
|
||||
# SQL for the trailing-24-hour automation operation breakdown.
#
# Returns one row per (outcome, operation_type) pair with a `count` column,
# ordered by outcome then operation_type. Two sources are combined with
# UNION ALL before grouping:
#   * automation_operation_log rows whose operation_type is one of the five
#     listed values and whose created_at is within the last 24 hours. Their
#     outcome is the raw status when status is not 'success'; otherwise
#     'human_required' when actor is 'approval_execution' and
#     input->>'requested_by' does not match 'auto...' (case-insensitive,
#     missing value treated as ''); otherwise 'auto_executed'.
#   * auto_repair_executions rows created within the last 24 hours, reported
#     with operation_type 'auto_repair_executed' and outcome 'auto_executed'
#     or 'failed' depending on the success column.
#
# NOTE(review): 'auto%%' carries a doubled percent sign. As a LIKE/ILIKE
# pattern two consecutive wildcards should match the same strings as one, so
# this is presumably a driver-escaping habit rather than a bug — confirm
# against the DB driver's paramstyle.
# NOTE(review): the lone `|` / `||||` lines inside this literal look like
# diff-viewer residue rather than SQL — verify against the repository copy
# before executing this text.
_AUTOMATION_OPERATION_24H_SQL = """
|
||||
WITH automation_scope AS (
|
||||
SELECT
|
||||
CASE
|
||||
WHEN status <> 'success' THEN status
|
||||
WHEN actor = 'approval_execution'
|
||||
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
|
||||
THEN 'human_required'
|
||||
ELSE 'auto_executed'
|
||||
END AS outcome,
|
||||
operation_type
|
||||
FROM automation_operation_log
|
||||
WHERE operation_type IN (
|
||||
'playbook_executed',
|
||||
'remediation_executed',
|
||||
'remediation_verified',
|
||||
'remediation_rolled_back',
|
||||
'self_correction_attempted'
|
||||
)
|
||||
AND created_at >= NOW() - INTERVAL '24 hours'
|
||||
UNION ALL
|
||||
SELECT
|
||||
CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
|
||||
'auto_repair_executed' AS operation_type
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
||||
)
|
||||
SELECT
|
||||
outcome,
|
||||
operation_type,
|
||||
count(*) AS count
|
||||
FROM automation_scope
|
||||
GROUP BY outcome, operation_type
|
||||
ORDER BY outcome, operation_type
|
||||
"""
|
||||
|
||||
|
||||
_POST_EXECUTION_VERIFICATION_SQL = """
|
||||
SELECT verification_result AS outcome, count(*) AS count
|
||||
FROM incident_evidence
|
||||
@@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """
|
||||
"""
|
||||
|
||||
|
||||
_POST_EXECUTION_VERIFICATION_24H_SQL = """
|
||||
SELECT verification_result AS outcome, count(*) AS count
|
||||
FROM incident_evidence
|
||||
WHERE verification_result IS NOT NULL
|
||||
AND collected_at >= NOW() - INTERVAL '24 hours'
|
||||
GROUP BY verification_result
|
||||
ORDER BY verification_result
|
||||
"""
|
||||
|
||||
|
||||
_HIGH_CONFIDENCE_APPROVAL_SQL = """
|
||||
WITH approval_confidence AS (
|
||||
SELECT
|
||||
|
||||
@@ -394,7 +394,7 @@ class GovernanceAgent:
|
||||
SLO 1 自主化率: sli:autonomy_rate:5m 硬紅線 < 0.70
|
||||
SLO 2 決策準確率: sli:decision_accuracy:5m 硬紅線 < 0.85
|
||||
SLO 3 信心校準: sli:confidence_calibration:1h 硬紅線 < 0.70
|
||||
SLO 4 KM 增長率: sli:km_growth_rate:24h 硬紅線 < 5
|
||||
SLO 4 KM 增長率: knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5
|
||||
|
||||
2026-04-27 P3.4 by Claude — AI SLO(ADR-100)
|
||||
"""
|
||||
@@ -409,7 +409,7 @@ class GovernanceAgent:
|
||||
"autonomy_rate": "sli:autonomy_rate:5m",
|
||||
"decision_accuracy": "sli:decision_accuracy:5m",
|
||||
"confidence_calibration": "sli:confidence_calibration:1h",
|
||||
"km_growth_rate": "sli:km_growth_rate:24h",
|
||||
"km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h",
|
||||
}
|
||||
# 硬紅線:低於此值必須告警(非軟性警告)
|
||||
hard_red_lines: dict[str, float] = {
|
||||
|
||||
Reference in New Issue
Block a user