fix(governance): stabilize adr100 km growth slo
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# =============================================================================
|
||||
# Deploy Prometheus Alert Rules (獨立 workflow)
|
||||
# 2026-04-05 Claude Code (ADR-039 I3): 從 cd.yaml 分離
|
||||
# 觸發條件: ops/monitoring/alerts-unified.yml 有變更 或 workflow_dispatch
|
||||
# 觸發條件: ops/monitoring/alerts-unified.yml / slo-rules.yml 有變更 或 workflow_dispatch
|
||||
# 說明: 告警規則部署不依賴應用構建,獨立觸發以加快響應速度
|
||||
# =============================================================================
|
||||
|
||||
@@ -12,6 +12,8 @@ on:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'ops/monitoring/alerts-unified.yml'
|
||||
- 'ops/monitoring/slo-rules.yml'
|
||||
- 'scripts/ops/deploy-alerts.sh'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
@@ -30,6 +32,7 @@ jobs:
|
||||
run: |
|
||||
pip3 install -q pyyaml 2>/dev/null || pip install -q pyyaml
|
||||
python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/alerts-unified.yml')); print('YAML OK')"
|
||||
python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/slo-rules.yml')); print('SLO YAML OK')"
|
||||
|
||||
- name: Setup SSH key
|
||||
run: |
|
||||
|
||||
@@ -33,8 +33,11 @@ class VerificationSample:
|
||||
@dataclass(frozen=True)
|
||||
class Adr100SloMetricsSnapshot:
|
||||
automation_operations: list[AutomationOperationSample] = field(default_factory=list)
|
||||
automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
|
||||
post_execution_verifications: list[VerificationSample] = field(default_factory=list)
|
||||
post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
|
||||
knowledge_entries_total: int = 0
|
||||
knowledge_entries_created_24h: int = 0
|
||||
high_confidence_total: int = 0
|
||||
high_confidence_success_total: int = 0
|
||||
emitted_at: float = field(default_factory=time)
|
||||
@@ -52,13 +55,33 @@ class Adr100SloMetricsService:
|
||||
automation_rows = (
|
||||
await db.execute(text(_AUTOMATION_OPERATION_SQL))
|
||||
).fetchall()
|
||||
automation_24h_rows = (
|
||||
await db.execute(text(_AUTOMATION_OPERATION_24H_SQL))
|
||||
).fetchall()
|
||||
verification_rows = (
|
||||
await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
|
||||
).fetchall()
|
||||
verification_24h_rows = (
|
||||
await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL))
|
||||
).fetchall()
|
||||
knowledge_total = int(
|
||||
(await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
|
||||
or 0
|
||||
)
|
||||
knowledge_created_24h = int(
|
||||
(
|
||||
await db.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT count(*)
|
||||
FROM knowledge_entries
|
||||
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
||||
"""
|
||||
)
|
||||
)
|
||||
).scalar()
|
||||
or 0
|
||||
)
|
||||
confidence_row = (
|
||||
await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
|
||||
).one()
|
||||
@@ -72,6 +95,14 @@ class Adr100SloMetricsService:
|
||||
)
|
||||
for row in automation_rows
|
||||
],
|
||||
automation_operations_24h=[
|
||||
AutomationOperationSample(
|
||||
outcome=str(row.outcome),
|
||||
operation_type=str(row.operation_type),
|
||||
count=int(row.count or 0),
|
||||
)
|
||||
for row in automation_24h_rows
|
||||
],
|
||||
post_execution_verifications=[
|
||||
VerificationSample(
|
||||
outcome=str(row.outcome),
|
||||
@@ -79,7 +110,15 @@ class Adr100SloMetricsService:
|
||||
)
|
||||
for row in verification_rows
|
||||
],
|
||||
post_execution_verifications_24h=[
|
||||
VerificationSample(
|
||||
outcome=str(row.outcome),
|
||||
count=int(row.count or 0),
|
||||
)
|
||||
for row in verification_24h_rows
|
||||
],
|
||||
knowledge_entries_total=knowledge_total,
|
||||
knowledge_entries_created_24h=knowledge_created_24h,
|
||||
high_confidence_total=int(confidence_row.high_confidence_total or 0),
|
||||
high_confidence_success_total=int(
|
||||
confidence_row.high_confidence_success_total or 0
|
||||
@@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
|
||||
'automation_operation_log_total{outcome="none",operation_type="none"} 0'
|
||||
)
|
||||
|
||||
lines.extend([
|
||||
"# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
|
||||
"# TYPE automation_operation_created_24h gauge",
|
||||
])
|
||||
if snapshot.automation_operations_24h:
|
||||
for sample in snapshot.automation_operations_24h:
|
||||
lines.append(
|
||||
"automation_operation_created_24h"
|
||||
f'{{outcome="{_escape_label(sample.outcome)}",'
|
||||
f'operation_type="{_escape_label(sample.operation_type)}"}} '
|
||||
f"{sample.count}"
|
||||
)
|
||||
else:
|
||||
lines.append(
|
||||
'automation_operation_created_24h{outcome="none",operation_type="none"} 0'
|
||||
)
|
||||
|
||||
lines.extend([
|
||||
"# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
|
||||
"# TYPE post_execution_verification_total counter",
|
||||
@@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
|
||||
else:
|
||||
lines.append('post_execution_verification_total{outcome="none"} 0')
|
||||
|
||||
lines.extend([
|
||||
"# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
|
||||
"# TYPE post_execution_verification_created_24h gauge",
|
||||
])
|
||||
if snapshot.post_execution_verifications_24h:
|
||||
for sample in snapshot.post_execution_verifications_24h:
|
||||
lines.append(
|
||||
"post_execution_verification_created_24h"
|
||||
f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
|
||||
)
|
||||
else:
|
||||
lines.append('post_execution_verification_created_24h{outcome="none"} 0')
|
||||
|
||||
lines.extend([
|
||||
"# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
|
||||
"# TYPE knowledge_entries_total counter",
|
||||
f"knowledge_entries_total {snapshot.knowledge_entries_total}",
|
||||
"# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
|
||||
"# TYPE knowledge_entries_created_24h gauge",
|
||||
f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
|
||||
"# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
|
||||
"# TYPE approval_records_high_confidence_total counter",
|
||||
f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
|
||||
@@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """
|
||||
"""
|
||||
|
||||
|
||||
_AUTOMATION_OPERATION_24H_SQL = """
|
||||
WITH automation_scope AS (
|
||||
SELECT
|
||||
CASE
|
||||
WHEN status <> 'success' THEN status
|
||||
WHEN actor = 'approval_execution'
|
||||
AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
|
||||
THEN 'human_required'
|
||||
ELSE 'auto_executed'
|
||||
END AS outcome,
|
||||
operation_type
|
||||
FROM automation_operation_log
|
||||
WHERE operation_type IN (
|
||||
'playbook_executed',
|
||||
'remediation_executed',
|
||||
'remediation_verified',
|
||||
'remediation_rolled_back',
|
||||
'self_correction_attempted'
|
||||
)
|
||||
AND created_at >= NOW() - INTERVAL '24 hours'
|
||||
UNION ALL
|
||||
SELECT
|
||||
CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
|
||||
'auto_repair_executed' AS operation_type
|
||||
FROM auto_repair_executions
|
||||
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
||||
)
|
||||
SELECT
|
||||
outcome,
|
||||
operation_type,
|
||||
count(*) AS count
|
||||
FROM automation_scope
|
||||
GROUP BY outcome, operation_type
|
||||
ORDER BY outcome, operation_type
|
||||
"""
|
||||
|
||||
|
||||
_POST_EXECUTION_VERIFICATION_SQL = """
|
||||
SELECT verification_result AS outcome, count(*) AS count
|
||||
FROM incident_evidence
|
||||
@@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """
|
||||
"""
|
||||
|
||||
|
||||
_POST_EXECUTION_VERIFICATION_24H_SQL = """
|
||||
SELECT verification_result AS outcome, count(*) AS count
|
||||
FROM incident_evidence
|
||||
WHERE verification_result IS NOT NULL
|
||||
AND collected_at >= NOW() - INTERVAL '24 hours'
|
||||
GROUP BY verification_result
|
||||
ORDER BY verification_result
|
||||
"""
|
||||
|
||||
|
||||
_HIGH_CONFIDENCE_APPROVAL_SQL = """
|
||||
WITH approval_confidence AS (
|
||||
SELECT
|
||||
|
||||
@@ -394,7 +394,7 @@ class GovernanceAgent:
|
||||
SLO 1 自主化率: sli:autonomy_rate:5m 硬紅線 < 0.70
|
||||
SLO 2 決策準確率: sli:decision_accuracy:5m 硬紅線 < 0.85
|
||||
SLO 3 信心校準: sli:confidence_calibration:1h 硬紅線 < 0.70
|
||||
SLO 4 KM 增長率: sli:km_growth_rate:24h 硬紅線 < 5
|
||||
SLO 4 KM 增長率: knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5
|
||||
|
||||
2026-04-27 P3.4 by Claude — AI SLO(ADR-100)
|
||||
"""
|
||||
@@ -409,7 +409,7 @@ class GovernanceAgent:
|
||||
"autonomy_rate": "sli:autonomy_rate:5m",
|
||||
"decision_accuracy": "sli:decision_accuracy:5m",
|
||||
"confidence_calibration": "sli:confidence_calibration:1h",
|
||||
"km_growth_rate": "sli:km_growth_rate:24h",
|
||||
"km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h",
|
||||
}
|
||||
# 硬紅線:低於此值必須告警(非軟性警告)
|
||||
hard_red_lines: dict[str, float] = {
|
||||
|
||||
@@ -20,11 +20,22 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
|
||||
count=2,
|
||||
),
|
||||
],
|
||||
automation_operations_24h=[
|
||||
AutomationOperationSample(
|
||||
outcome="auto_executed",
|
||||
operation_type="auto_repair_executed",
|
||||
count=3,
|
||||
),
|
||||
],
|
||||
post_execution_verifications=[
|
||||
VerificationSample(outcome="success", count=7),
|
||||
VerificationSample(outcome="failed", count=1),
|
||||
],
|
||||
post_execution_verifications_24h=[
|
||||
VerificationSample(outcome="success", count=5),
|
||||
],
|
||||
knowledge_entries_total=2161,
|
||||
knowledge_entries_created_24h=25,
|
||||
high_confidence_total=9,
|
||||
high_confidence_success_total=7,
|
||||
emitted_at=1_778_756_000,
|
||||
@@ -36,8 +47,14 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
|
||||
'automation_operation_log_total{outcome="auto_executed",'
|
||||
'operation_type="playbook_executed"} 8'
|
||||
) in rendered
|
||||
assert (
|
||||
'automation_operation_created_24h{outcome="auto_executed",'
|
||||
'operation_type="auto_repair_executed"} 3'
|
||||
) in rendered
|
||||
assert 'post_execution_verification_total{outcome="success"} 7' in rendered
|
||||
assert 'post_execution_verification_created_24h{outcome="success"} 5' in rendered
|
||||
assert "knowledge_entries_total 2161" in rendered
|
||||
assert "knowledge_entries_created_24h 25" in rendered
|
||||
assert "approval_records_high_confidence_total 9" in rendered
|
||||
assert "approval_records_high_confidence_success_total 7" in rendered
|
||||
assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered
|
||||
@@ -49,8 +66,11 @@ def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
|
||||
)
|
||||
|
||||
assert 'automation_operation_log_total{outcome="none",operation_type="none"} 0' in rendered
|
||||
assert 'automation_operation_created_24h{outcome="none",operation_type="none"} 0' in rendered
|
||||
assert 'post_execution_verification_total{outcome="none"} 0' in rendered
|
||||
assert 'post_execution_verification_created_24h{outcome="none"} 0' in rendered
|
||||
assert "knowledge_entries_total 0" in rendered
|
||||
assert "knowledge_entries_created_24h 0" in rendered
|
||||
|
||||
|
||||
def test_render_adr100_slo_metrics_escapes_labels() -> None:
|
||||
|
||||
@@ -654,6 +654,7 @@ class _FakePrometheusResponse:
|
||||
class _FakePrometheusClient:
|
||||
def __init__(self, value: str) -> None:
|
||||
self._value = value
|
||||
self.queries: list[str] = []
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
@@ -662,6 +663,7 @@ class _FakePrometheusClient:
|
||||
return False
|
||||
|
||||
async def get(self, *args, **kwargs): # noqa: ANN002, ANN003
|
||||
self.queries.append(str(kwargs.get("params", {}).get("query", "")))
|
||||
return _FakePrometheusResponse(self._value)
|
||||
|
||||
|
||||
@@ -683,3 +685,16 @@ class TestCheckSloCompliance:
|
||||
assert result[name]["status"] == "skipped"
|
||||
assert result[name]["reason"] == "prometheus_nan_or_inf"
|
||||
assert result["_meta"]["status"] == "no_data"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_km_growth_prefers_db_derived_24h_gauge(self):
|
||||
"""KM SLO 要優先使用 DB 24h gauge,避免新 counter 暖機時誤報 0."""
|
||||
agent = _make_agent()
|
||||
client = _FakePrometheusClient("25")
|
||||
|
||||
with patch("httpx.AsyncClient", return_value=client):
|
||||
result = await agent.check_slo_compliance()
|
||||
|
||||
assert "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h" in client.queries
|
||||
assert result["km_growth_rate"]["status"] == "ok"
|
||||
assert result["km_growth_rate"]["value"] == 25
|
||||
|
||||
@@ -110,11 +110,18 @@ sum(rate(approval_records_high_confidence_total[1h]))
|
||||
|
||||
**SLI 計算式**:
|
||||
```promql
|
||||
max(knowledge_entries_created_24h)
|
||||
or
|
||||
increase(knowledge_entries_total[24h])
|
||||
```
|
||||
|
||||
**Recording rule**: `sli:km_growth_rate:24h`
|
||||
|
||||
**資料來源備註(2026-05-14 T19)**:`knowledge_entries_created_24h`
|
||||
是 API `/metrics` 直接從 PostgreSQL `knowledge_entries.created_at >= now()-24h`
|
||||
產出的 gauge。`increase(knowledge_entries_total[24h])` 只作舊 counter fallback,
|
||||
避免 emitter 新上線時 Prometheus 還沒有 24h counter history 而誤報 KM 增長為 0。
|
||||
|
||||
**目標值(SLO)**: ≥ 20 筆/day
|
||||
|
||||
**Error budget**:不適用標準 burn rate(絕對值 SLO),改用閾值告警
|
||||
@@ -158,7 +165,7 @@ increase(knowledge_entries_total[24h])
|
||||
| `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 |
|
||||
| `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard |
|
||||
| `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 |
|
||||
| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series;`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍,背景治理工作不進 AI 自動修復 SLO 分母 |
|
||||
| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series;`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍,背景治理工作不進 AI 自動修復 SLO 分母。2026-05-14 T19:追加 `*_created_24h` gauges,供治理 Agent / 前端直接顯示最近 24h 事實量,避免 counter 暖機造成 false red |
|
||||
| `apps/api/src/main.py` `/metrics` | 2026-05-14 T18:追加 DB-derived SLO emitter,讓既有 `awoooi-api` scrape job 取得底層 series |
|
||||
|
||||
## 決策理由
|
||||
|
||||
@@ -45,9 +45,12 @@ groups:
|
||||
/
|
||||
sum(rate(approval_records_high_confidence_total[1h]))
|
||||
|
||||
# SLO 4: KM 增長率 = 24h increase (絕對值,不做 rate)
|
||||
# SLO 4: KM 增長率 = DB-derived 24h gauge;fallback 給舊 counter history
|
||||
- record: sli:km_growth_rate:24h
|
||||
expr: increase(knowledge_entries_total[24h])
|
||||
# Prefer the DB-derived 24h gauge; fall back to the counter increase only when
# the gauge is absent. `on()` makes `or` match on the empty label set: max()
# drops every label while increase() keeps the series' own labels, so a plain
# `or` would never see the two sides as matching and would record both, letting
# a counter without 24h of history still trip the KM growth alerts.
expr: |
  max(knowledge_entries_created_24h)
  or on()
  increase(knowledge_entries_total[24h])
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Error Budget Recording Rules(輔助 Grafana 顯示)
|
||||
@@ -248,7 +251,7 @@ groups:
|
||||
summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂"
|
||||
description: "過去 24h KM 新增 {{ $value }} 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。"
|
||||
runbook: |
|
||||
1. 確認 knowledge_entries_total counter 是否正常遞增
|
||||
1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
|
||||
2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
|
||||
3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md)
|
||||
4. 手動執行 POST /api/v1/governance/check
|
||||
|
||||
@@ -35,7 +35,8 @@ tests:
|
||||
- expr: sli:autonomy_rate:5m
|
||||
eval_time: 15m
|
||||
exp_samples:
|
||||
- value: 0.8
|
||||
- labels: '{__name__="sli:autonomy_rate:5m"}'
|
||||
value: 0.8
|
||||
|
||||
# ---- SLI 1: 自主化率 = 100%(無 human_required)----
|
||||
- interval: 1m
|
||||
@@ -47,7 +48,8 @@ tests:
|
||||
- expr: sli:autonomy_rate:5m
|
||||
eval_time: 15m
|
||||
exp_samples:
|
||||
- value: 1.0
|
||||
- labels: '{__name__="sli:autonomy_rate:5m"}'
|
||||
value: 1.0
|
||||
|
||||
# ---- SLI 2: 決策準確率 = 90% (success=9, auto_executed=10) ----
|
||||
- interval: 1m
|
||||
@@ -61,7 +63,8 @@ tests:
|
||||
- expr: sli:decision_accuracy:5m
|
||||
eval_time: 15m
|
||||
exp_samples:
|
||||
- value: 0.9
|
||||
- labels: '{__name__="sli:decision_accuracy:5m"}'
|
||||
value: 0.9
|
||||
|
||||
# ---- SLI 4: KM 增長率(24h increase)----
|
||||
- interval: 1m
|
||||
@@ -74,7 +77,23 @@ tests:
|
||||
eval_time: 25h
|
||||
exp_samples:
|
||||
# increase over 24h = 1440 samples × 1/min
|
||||
- value: 1440
|
||||
- labels: '{__name__="sli:km_growth_rate:24h"}'
|
||||
value: 1440
|
||||
|
||||
# ---- SLI 4: DB-derived gauge 優先,避免 counter 新上線暖機誤報 0 ----
|
||||
- interval: 1m
|
||||
name: "sli:km_growth_rate:24h 應優先使用 knowledge_entries_created_24h"
|
||||
input_series:
|
||||
- series: "knowledge_entries_created_24h"
|
||||
values: "25x30"
|
||||
- series: "knowledge_entries_total"
|
||||
values: "100x30"
|
||||
promql_expr_test:
|
||||
- expr: sli:km_growth_rate:24h
|
||||
eval_time: 15m
|
||||
exp_samples:
|
||||
- labels: '{__name__="sli:km_growth_rate:24h"}'
|
||||
value: 25
|
||||
|
||||
# ============================================================
|
||||
# Alert Tests — SLO 1: 自主化率
|
||||
@@ -120,6 +139,10 @@ tests:
|
||||
burn_window: 3d
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
exp_annotations:
|
||||
summary: "SLO 自主化率 slow burn(長期趨勢偏低)"
|
||||
description: "自主化率長期低於目標,累積 error budget 消耗率偏高,建議本週 review。"
|
||||
runbook: "分析近 7d 數據,是否需要重訓或調整 confidence threshold。"
|
||||
|
||||
# ---- 負測: 自主化率 = 85% → SlowBurn 不觸發 ----
|
||||
- interval: 1m
|
||||
@@ -157,6 +180,10 @@ tests:
|
||||
burn_window: 3d
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
exp_annotations:
|
||||
summary: "SLO 決策準確率 slow burn(長期趨勢偏低)"
|
||||
description: "決策準確率長期低於目標,累積 error budget 消耗偏高。"
|
||||
runbook: "近 7d verifier 失敗分析,考慮 playbook fine-tune。"
|
||||
|
||||
# ---- 負測: 決策準確率 = 92% → SlowBurn 不觸發 ----
|
||||
- interval: 1m
|
||||
@@ -192,6 +219,14 @@ tests:
|
||||
slo_name: km_growth_rate
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
exp_annotations:
|
||||
summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂"
|
||||
description: "過去 24h KM 新增 0 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。"
|
||||
runbook: |
|
||||
1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
|
||||
2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
|
||||
3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md)
|
||||
4. 手動執行 POST /api/v1/governance/check
|
||||
|
||||
# ---- 正測: KM 增長率 = 3/day → Critical 觸發(< 5)----
|
||||
- interval: 30m
|
||||
@@ -210,6 +245,14 @@ tests:
|
||||
slo_name: km_growth_rate
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
exp_annotations:
|
||||
summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂"
|
||||
description: "過去 24h KM 新增 2.9999999999999996 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。"
|
||||
runbook: |
|
||||
1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
|
||||
2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
|
||||
3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md)
|
||||
4. 手動執行 POST /api/v1/governance/check
|
||||
|
||||
# ---- 負測: KM 增長率 = 30/day → Critical 不觸發 ----
|
||||
- interval: 1m
|
||||
@@ -240,3 +283,7 @@ tests:
|
||||
slo_name: km_growth_rate
|
||||
team: ai
|
||||
auto_repair: "false"
|
||||
exp_annotations:
|
||||
summary: "SLO KM 增長率偏低(< 20 筆/day)"
|
||||
description: "過去 24h KM 新增 14.976000000000393 筆,低於目標 20 筆/day。"
|
||||
runbook: "查 KM 寫入路徑(auto_execute 後 _write_execution_result_to_km),確認飛輪 KM 閉環正常。"
|
||||
|
||||
@@ -6,45 +6,57 @@
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
RULES_FILE="ops/monitoring/alerts-unified.yml"
|
||||
ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
|
||||
SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
|
||||
TARGET_HOST="192.168.0.110"
|
||||
TARGET_PATH="/home/wooo/monitoring/alerts.yml"
|
||||
TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
|
||||
TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
|
||||
PROMETHEUS_URL="http://${TARGET_HOST}:9090"
|
||||
DRY_RUN="${1:-}"
|
||||
|
||||
log() { echo "[$(date '+%H:%M:%S')] $*"; }
|
||||
|
||||
# 確認檔案存在
|
||||
if [ ! -f "$RULES_FILE" ]; then
|
||||
echo "ERROR: $RULES_FILE not found"
|
||||
exit 1
|
||||
fi
|
||||
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
|
||||
if [ ! -f "$file" ]; then
|
||||
echo "ERROR: $file not found"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# 驗證 YAML 語法
|
||||
if python3 -c "import yaml; yaml.safe_load(open('$RULES_FILE'))" 2>/dev/null; then
|
||||
:
|
||||
elif ruby -e "require 'yaml'; YAML.load_file('$RULES_FILE')" 2>/dev/null; then
|
||||
:
|
||||
else
|
||||
echo "ERROR: YAML syntax error or no YAML parser available"
|
||||
exit 1
|
||||
fi
|
||||
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
|
||||
if python3 -c "import yaml; yaml.safe_load(open('$file'))" 2>/dev/null; then
|
||||
:
|
||||
elif ruby -e "require 'yaml'; YAML.load_file('$file')" 2>/dev/null; then
|
||||
:
|
||||
else
|
||||
echo "ERROR: YAML syntax error or no YAML parser available: $file"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
log "✅ YAML 語法驗證通過"
|
||||
|
||||
# Dry run 模式
|
||||
if [ "$DRY_RUN" = "--dry-run" ]; then
|
||||
log "DRY RUN: would deploy $RULES_FILE to ${TARGET_HOST}:${TARGET_PATH}"
|
||||
RULE_COUNT=$(grep -c "alert:" "$RULES_FILE")
|
||||
log "規則數量: $RULE_COUNT 條"
|
||||
log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
|
||||
log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
|
||||
# grep -c still prints 0 but exits 1 when nothing matches; with `set -eo pipefail`
# that would abort the dry run before the summary is logged, so tolerate it.
ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE" || true)
SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE" || true)
SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE" || true)
|
||||
log "告警規則數量: $ALERT_COUNT 條;SLO recording: $SLO_RECORD_COUNT 條;SLO alerts: $SLO_ALERT_COUNT 條"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 備份現有規則
|
||||
ssh wooo@${TARGET_HOST} "cp ${TARGET_PATH} ${TARGET_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
|
||||
ssh wooo@${TARGET_HOST} "\
|
||||
cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \
|
||||
cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
|
||||
log "✅ 現有規則已備份"
|
||||
|
||||
# 部署新規則
|
||||
scp "$RULES_FILE" wooo@${TARGET_HOST}:${TARGET_PATH}
|
||||
scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH}
|
||||
scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH}
|
||||
log "✅ 規則已複製到 ${TARGET_HOST}"
|
||||
|
||||
# Reload Prometheus
|
||||
@@ -72,4 +84,15 @@ for rule in "${KEY_RULES[@]}"; do
|
||||
fi
|
||||
done
|
||||
|
||||
KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical")
|
||||
for rule in "${KEY_SLO_RULES[@]}"; do
|
||||
EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"")
|
||||
if [ "$EXISTS" = "OK" ]; then
|
||||
log "✅ $rule"
|
||||
else
|
||||
echo "❌ $rule 未找到"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
log "🎉 部署完成!所有關鍵規則已生效"
|
||||
|
||||
Reference in New Issue
Block a user