From d2a4a179691434cda939ccf992cedea115489a2f Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 14 May 2026 19:33:52 +0800 Subject: [PATCH] fix(governance): stabilize adr100 km growth slo --- .gitea/workflows/deploy-alerts.yaml | 5 +- .../services/adr100_slo_metrics_service.py | 119 ++++++++++++++++++ apps/api/src/services/governance_agent.py | 4 +- .../tests/test_adr100_slo_metrics_service.py | 20 +++ apps/api/tests/test_governance_agent.py | 15 +++ docs/adr/ADR-100-ai-autonomous-slo.md | 9 +- ops/monitoring/slo-rules.yml | 9 +- ops/monitoring/tests/test_slo_rules.yaml | 55 +++++++- scripts/ops/deploy-alerts.sh | 61 ++++++--- 9 files changed, 267 insertions(+), 30 deletions(-) diff --git a/.gitea/workflows/deploy-alerts.yaml b/.gitea/workflows/deploy-alerts.yaml index 11af3273..70f37f34 100644 --- a/.gitea/workflows/deploy-alerts.yaml +++ b/.gitea/workflows/deploy-alerts.yaml @@ -1,7 +1,7 @@ # ============================================================================= # Deploy Prometheus Alert Rules (獨立 workflow) # 2026-04-05 Claude Code (ADR-039 I3): 從 cd.yaml 分離 -# 觸發條件: ops/monitoring/alerts-unified.yml 有變更 或 workflow_dispatch +# 觸發條件: ops/monitoring/alerts-unified.yml / slo-rules.yml 有變更 或 workflow_dispatch # 說明: 告警規則部署不依賴應用構建,獨立觸發以加快響應速度 # ============================================================================= @@ -12,6 +12,8 @@ on: branches: [main] paths: - 'ops/monitoring/alerts-unified.yml' + - 'ops/monitoring/slo-rules.yml' + - 'scripts/ops/deploy-alerts.sh' workflow_dispatch: env: @@ -30,6 +32,7 @@ jobs: run: | pip3 install -q pyyaml 2>/dev/null || pip install -q pyyaml python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/alerts-unified.yml')); print('YAML OK')" + python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/slo-rules.yml')); print('SLO YAML OK')" - name: Setup SSH key run: | diff --git a/apps/api/src/services/adr100_slo_metrics_service.py b/apps/api/src/services/adr100_slo_metrics_service.py index 0221ebe5..801a7340 100644 --- a/apps/api/src/services/adr100_slo_metrics_service.py +++ b/apps/api/src/services/adr100_slo_metrics_service.py @@ -33,8 +33,11 @@ class VerificationSample: @dataclass(frozen=True) class Adr100SloMetricsSnapshot: automation_operations: list[AutomationOperationSample] = field(default_factory=list) + automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list) post_execution_verifications: list[VerificationSample] = field(default_factory=list) + post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list) knowledge_entries_total: int = 0 + knowledge_entries_created_24h: int = 0 high_confidence_total: int = 0 high_confidence_success_total: int = 0 emitted_at: float = field(default_factory=time) @@ -52,13 +55,33 @@ class Adr100SloMetricsService: automation_rows = ( await db.execute(text(_AUTOMATION_OPERATION_SQL)) ).fetchall() + automation_24h_rows = ( + await db.execute(text(_AUTOMATION_OPERATION_24H_SQL)) + ).fetchall() verification_rows = ( await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL)) ).fetchall() + verification_24h_rows = ( + await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL)) + ).fetchall() knowledge_total = int( (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar() or 0 ) + knowledge_created_24h = int( + ( + await db.execute( + text( + """ + SELECT count(*) + FROM knowledge_entries + WHERE created_at >= NOW() - INTERVAL '24 hours' + """ + ) + ) + ).scalar() + or 0 + ) confidence_row = ( await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL)) ).one() @@ -72,6 +95,14 @@ class Adr100SloMetricsService: ) for row in automation_rows ], + automation_operations_24h=[ + AutomationOperationSample( + outcome=str(row.outcome), + operation_type=str(row.operation_type), + count=int(row.count or 0), + ) + for row in automation_24h_rows + ], post_execution_verifications=[ VerificationSample( outcome=str(row.outcome), @@ -79,7 +110,15 @@ class Adr100SloMetricsService: ) for row in verification_rows ], + post_execution_verifications_24h=[ + VerificationSample( + outcome=str(row.outcome), + count=int(row.count or 0), + ) + for row in verification_24h_rows + ], knowledge_entries_total=knowledge_total, + knowledge_entries_created_24h=knowledge_created_24h, high_confidence_total=int(confidence_row.high_confidence_total or 0), high_confidence_success_total=int( confidence_row.high_confidence_success_total or 0 @@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str: 'automation_operation_log_total{outcome="none",operation_type="none"} 0' ) + lines.extend([ + "# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards", + "# TYPE automation_operation_created_24h gauge", + ]) + if snapshot.automation_operations_24h: + for sample in snapshot.automation_operations_24h: + lines.append( + "automation_operation_created_24h" + f'{{outcome="{_escape_label(sample.outcome)}",' + f'operation_type="{_escape_label(sample.operation_type)}"}} ' + f"{sample.count}" + ) + else: + lines.append( + 'automation_operation_created_24h{outcome="none",operation_type="none"} 0' + ) + lines.extend([ "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs", "# TYPE post_execution_verification_total counter", @@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str: else: lines.append('post_execution_verification_total{outcome="none"} 0') + lines.extend([ + "# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards", + "# TYPE post_execution_verification_created_24h gauge", + ]) + if snapshot.post_execution_verifications_24h: + for sample in snapshot.post_execution_verifications_24h: + lines.append( + "post_execution_verification_created_24h" + f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}' + ) + else: + lines.append('post_execution_verification_created_24h{outcome="none"} 0') + lines.extend([ "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs", "# TYPE knowledge_entries_total counter", f"knowledge_entries_total {snapshot.knowledge_entries_total}", + "# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs", + "# TYPE knowledge_entries_created_24h gauge", + f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}", "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs", "# TYPE approval_records_high_confidence_total counter", f"approval_records_high_confidence_total {snapshot.high_confidence_total}", @@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """ """ +_AUTOMATION_OPERATION_24H_SQL = """ + WITH automation_scope AS ( + SELECT + CASE + WHEN status <> 'success' THEN status + WHEN actor = 'approval_execution' + AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%' + THEN 'human_required' + ELSE 'auto_executed' + END AS outcome, + operation_type + FROM automation_operation_log + WHERE operation_type IN ( + 'playbook_executed', + 'remediation_executed', + 'remediation_verified', + 'remediation_rolled_back', + 'self_correction_attempted' + ) + AND created_at >= NOW() - INTERVAL '24 hours' + UNION ALL + SELECT + CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome, + 'auto_repair_executed' AS operation_type + FROM auto_repair_executions + WHERE created_at >= NOW() - INTERVAL '24 hours' + ) + SELECT + outcome, + operation_type, + count(*) AS count + FROM automation_scope + GROUP BY outcome, operation_type + ORDER BY outcome, operation_type +""" + + _POST_EXECUTION_VERIFICATION_SQL = """ SELECT verification_result AS outcome, count(*) AS count FROM incident_evidence @@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """ """ +_POST_EXECUTION_VERIFICATION_24H_SQL = """ + SELECT verification_result AS outcome, count(*) AS count + FROM incident_evidence + WHERE verification_result IS NOT NULL + AND collected_at >= NOW() - INTERVAL '24 hours' + GROUP BY verification_result + ORDER BY verification_result +""" + + _HIGH_CONFIDENCE_APPROVAL_SQL = """ WITH approval_confidence AS ( SELECT diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py index 9e2596bf..1e7371c9 100644 --- a/apps/api/src/services/governance_agent.py +++ b/apps/api/src/services/governance_agent.py @@ -394,7 +394,7 @@ class GovernanceAgent: SLO 1 自主化率: sli:autonomy_rate:5m 硬紅線 < 0.70 SLO 2 決策準確率: sli:decision_accuracy:5m 硬紅線 < 0.85 SLO 3 信心校準: sli:confidence_calibration:1h 硬紅線 < 0.70 - SLO 4 KM 增長率: sli:km_growth_rate:24h 硬紅線 < 5 + SLO 4 KM 增長率: knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5 2026-04-27 P3.4 by Claude — AI SLO(ADR-100) """ @@ -409,7 +409,7 @@ class GovernanceAgent: "autonomy_rate": "sli:autonomy_rate:5m", "decision_accuracy": "sli:decision_accuracy:5m", "confidence_calibration": "sli:confidence_calibration:1h", - "km_growth_rate": "sli:km_growth_rate:24h", + "km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h", } # 硬紅線:低於此值必須告警(非軟性警告) hard_red_lines: dict[str, float] = { diff --git a/apps/api/tests/test_adr100_slo_metrics_service.py b/apps/api/tests/test_adr100_slo_metrics_service.py index 5fa970cf..8fbb4eb4 100644 --- a/apps/api/tests/test_adr100_slo_metrics_service.py +++ b/apps/api/tests/test_adr100_slo_metrics_service.py @@ -20,11 +20,22 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None: count=2, ), ], + automation_operations_24h=[ + AutomationOperationSample( + outcome="auto_executed", + operation_type="auto_repair_executed", + count=3, + ), + ], post_execution_verifications=[ VerificationSample(outcome="success", count=7), VerificationSample(outcome="failed", count=1), ], + post_execution_verifications_24h=[ + VerificationSample(outcome="success", count=5), + ], knowledge_entries_total=2161, + knowledge_entries_created_24h=25, high_confidence_total=9, high_confidence_success_total=7, emitted_at=1_778_756_000, @@ -36,8 +47,14 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None: 'automation_operation_log_total{outcome="auto_executed",' 'operation_type="playbook_executed"} 8' ) in rendered + assert ( + 'automation_operation_created_24h{outcome="auto_executed",' + 'operation_type="auto_repair_executed"} 3' + ) in rendered assert 'post_execution_verification_total{outcome="success"} 7' in rendered + assert 'post_execution_verification_created_24h{outcome="success"} 5' in rendered assert "knowledge_entries_total 2161" in rendered + assert "knowledge_entries_created_24h 25" in rendered assert "approval_records_high_confidence_total 9" in rendered assert "approval_records_high_confidence_success_total 7" in rendered assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered @@ -49,8 +66,11 @@ def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None: ) assert 'automation_operation_log_total{outcome="none",operation_type="none"} 0' in rendered + assert 'automation_operation_created_24h{outcome="none",operation_type="none"} 0' in rendered assert 'post_execution_verification_total{outcome="none"} 0' in rendered + assert 'post_execution_verification_created_24h{outcome="none"} 0' in rendered assert "knowledge_entries_total 0" in rendered + assert "knowledge_entries_created_24h 0" in rendered def test_render_adr100_slo_metrics_escapes_labels() -> None: diff --git a/apps/api/tests/test_governance_agent.py b/apps/api/tests/test_governance_agent.py index d345a402..7d9593a6 100644 --- a/apps/api/tests/test_governance_agent.py +++ b/apps/api/tests/test_governance_agent.py @@ -654,6 +654,7 @@ class _FakePrometheusResponse: class _FakePrometheusClient: def __init__(self, value: str) -> None: self._value = value + self.queries: list[str] = [] async def __aenter__(self): return self @@ -662,6 +663,7 @@ class _FakePrometheusClient: return False async def get(self, *args, **kwargs): # noqa: ANN002, ANN003 + self.queries.append(str(kwargs.get("params", {}).get("query", ""))) return _FakePrometheusResponse(self._value) @@ -683,3 +685,16 @@ class TestCheckSloCompliance: assert result[name]["status"] == "skipped" assert result[name]["reason"] == "prometheus_nan_or_inf" assert result["_meta"]["status"] == "no_data" + + @pytest.mark.asyncio + async def test_km_growth_prefers_db_derived_24h_gauge(self): + """KM SLO 要優先使用 DB 24h gauge,避免新 counter 暖機時誤報 0.""" + agent = _make_agent() + client = _FakePrometheusClient("25") + + with patch("httpx.AsyncClient", return_value=client): + result = await agent.check_slo_compliance() + + assert "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h" in client.queries + assert result["km_growth_rate"]["status"] == "ok" + assert result["km_growth_rate"]["value"] == 25 diff --git a/docs/adr/ADR-100-ai-autonomous-slo.md b/docs/adr/ADR-100-ai-autonomous-slo.md index 26bfe6bf..f3453bea 100644 --- a/docs/adr/ADR-100-ai-autonomous-slo.md +++ b/docs/adr/ADR-100-ai-autonomous-slo.md @@ -110,11 +110,18 @@ sum(rate(approval_records_high_confidence_total[1h])) **SLI 計算式**: ```promql +max(knowledge_entries_created_24h) +or increase(knowledge_entries_total[24h]) ``` **Recording rule**: `sli:km_growth_rate:24h` +**資料來源備註(2026-05-14 T19)**:`knowledge_entries_created_24h` +是 API `/metrics` 直接從 PostgreSQL `knowledge_entries.created_at >= now()-24h` +產出的 gauge。`increase(knowledge_entries_total[24h])` 只作舊 counter fallback, +避免 emitter 新上線時 Prometheus 還沒有 24h counter history 而誤報 KM 增長為 0。 + **目標值(SLO)**: ≥ 20 筆/day **Error budget**:不適用標準 burn rate(絕對值 SLO),改用閾值告警 @@ -158,7 +165,7 @@ increase(knowledge_entries_total[24h]) | `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 | | `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard | | `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 | -| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series;`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍,背景治理工作不進 AI 自動修復 SLO 分母 | +| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18:從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series;`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍,背景治理工作不進 AI 自動修復 SLO 分母。2026-05-14 T19:追加 `*_created_24h` gauges,供治理 Agent / 前端直接顯示最近 24h 事實量,避免 counter 暖機造成 false red | | `apps/api/src/main.py` `/metrics` | 2026-05-14 T18:追加 DB-derived SLO emitter,讓既有 `awoooi-api` scrape job 取得底層 series | ## 決策理由 diff --git a/ops/monitoring/slo-rules.yml b/ops/monitoring/slo-rules.yml index 4f0bd243..115bffbe 100644 --- a/ops/monitoring/slo-rules.yml +++ b/ops/monitoring/slo-rules.yml @@ -45,9 +45,12 @@ groups: / sum(rate(approval_records_high_confidence_total[1h])) - # SLO 4: KM 增長率 = 24h increase (絕對值,不做 rate) + # SLO 4: KM 增長率 = DB-derived 24h gauge;fallback 給舊 counter history - record: sli:km_growth_rate:24h - expr: increase(knowledge_entries_total[24h]) + expr: | + max(knowledge_entries_created_24h) + or + increase(knowledge_entries_total[24h]) # ----------------------------------------------------------------------- # Error Budget Recording Rules(輔助 Grafana 顯示) @@ -248,7 +251,7 @@ groups: summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂" description: "過去 24h KM 新增 {{ $value }} 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。" runbook: | - 1. 確認 knowledge_entries_total counter 是否正常遞增 + 1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增 2. 查 governance_agent 日誌中 governance_km_growth_slo_violation 3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md) 4. 手動執行 POST /api/v1/governance/check diff --git a/ops/monitoring/tests/test_slo_rules.yaml b/ops/monitoring/tests/test_slo_rules.yaml index 3d872d3d..a935f38e 100644 --- a/ops/monitoring/tests/test_slo_rules.yaml +++ b/ops/monitoring/tests/test_slo_rules.yaml @@ -35,7 +35,8 @@ tests: - expr: sli:autonomy_rate:5m eval_time: 15m exp_samples: - - value: 0.8 + - labels: '{__name__="sli:autonomy_rate:5m"}' + value: 0.8 # ---- SLI 1: 自主化率 = 100%(無 human_required)---- - interval: 1m @@ -47,7 +48,8 @@ tests: - expr: sli:autonomy_rate:5m eval_time: 15m exp_samples: - - value: 1.0 + - labels: '{__name__="sli:autonomy_rate:5m"}' + value: 1.0 # ---- SLI 2: 決策準確率 = 90% (success=9, auto_executed=10) ---- - interval: 1m @@ -61,7 +63,8 @@ tests: - expr: sli:decision_accuracy:5m eval_time: 15m exp_samples: - - value: 0.9 + - labels: '{__name__="sli:decision_accuracy:5m"}' + value: 0.9 # ---- SLI 4: KM 增長率(24h increase)---- - interval: 1m @@ -74,7 +77,23 @@ tests: eval_time: 25h exp_samples: # increase over 24h = 1440 samples × 1/min - - value: 1440 + - labels: '{__name__="sli:km_growth_rate:24h"}' + value: 1440 + + # ---- SLI 4: DB-derived gauge 優先,避免 counter 新上線暖機誤報 0 ---- + - interval: 1m + name: "sli:km_growth_rate:24h 應優先使用 knowledge_entries_created_24h" + input_series: + - series: "knowledge_entries_created_24h" + values: "25x30" + - series: "knowledge_entries_total" + values: "100x30" + promql_expr_test: + - expr: sli:km_growth_rate:24h + eval_time: 15m + exp_samples: + - labels: '{__name__="sli:km_growth_rate:24h"}' + value: 25 # ============================================================ # Alert Tests — SLO 1: 自主化率 @@ -120,6 +139,10 @@ tests: burn_window: 3d team: ai auto_repair: "false" + exp_annotations: + summary: "SLO 自主化率 slow burn(長期趨勢偏低)" + description: "自主化率長期低於目標,累積 error budget 消耗率偏高,建議本週 review。" + runbook: "分析近 7d 數據,是否需要重訓或調整 confidence threshold。" # ---- 負測: 自主化率 = 85% → SlowBurn 不觸發 ---- - interval: 1m @@ -157,6 +180,10 @@ tests: burn_window: 3d team: ai auto_repair: "false" + exp_annotations: + summary: "SLO 決策準確率 slow burn(長期趨勢偏低)" + description: "決策準確率長期低於目標,累積 error budget 消耗偏高。" + runbook: "近 7d verifier 失敗分析,考慮 playbook fine-tune。" # ---- 負測: 決策準確率 = 92% → SlowBurn 不觸發 ---- - interval: 1m @@ -192,6 +219,14 @@ tests: slo_name: km_growth_rate team: ai auto_repair: "false" + exp_annotations: + summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂" + description: "過去 24h KM 新增 0 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。" + runbook: | + 1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增 + 2. 查 governance_agent 日誌中 governance_km_growth_slo_violation + 3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md) + 4. 手動執行 POST /api/v1/governance/check # ---- 正測: KM 增長率 = 3/day → Critical 觸發(< 5)---- - interval: 30m @@ -210,6 +245,14 @@ tests: slo_name: km_growth_rate team: ai auto_repair: "false" + exp_annotations: + summary: "SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂" + description: "過去 24h KM 新增 2.9999999999999996 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。" + runbook: | + 1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增 + 2. 查 governance_agent 日誌中 governance_km_growth_slo_violation + 3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md) + 4. 手動執行 POST /api/v1/governance/check # ---- 負測: KM 增長率 = 30/day → Critical 不觸發 ---- - interval: 1m @@ -240,3 +283,7 @@ tests: slo_name: km_growth_rate team: ai auto_repair: "false" + exp_annotations: + summary: "SLO KM 增長率偏低(< 20 筆/day)" + description: "過去 24h KM 新增 14.976000000000393 筆,低於目標 20 筆/day。" + runbook: "查 KM 寫入路徑(auto_execute 後 _write_execution_result_to_km),確認飛輪 KM 閉環正常。" diff --git a/scripts/ops/deploy-alerts.sh b/scripts/ops/deploy-alerts.sh index f35a8fd3..dffff073 100755 --- a/scripts/ops/deploy-alerts.sh +++ b/scripts/ops/deploy-alerts.sh @@ -6,45 +6,57 @@ set -eo pipefail -RULES_FILE="ops/monitoring/alerts-unified.yml" +ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml" +SLO_RULES_FILE="ops/monitoring/slo-rules.yml" TARGET_HOST="192.168.0.110" -TARGET_PATH="/home/wooo/monitoring/alerts.yml" +TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml" +TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml" PROMETHEUS_URL="http://${TARGET_HOST}:9090" DRY_RUN="${1:-}" log() { echo "[$(date '+%H:%M:%S')] $*"; } # 確認檔案存在 -if [ ! -f "$RULES_FILE" ]; then - echo "ERROR: $RULES_FILE not found" - exit 1 -fi +for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do + if [ ! -f "$file" ]; then + echo "ERROR: $file not found" + exit 1 + fi +done # 驗證 YAML 語法 -if python3 -c "import yaml; yaml.safe_load(open('$RULES_FILE'))" 2>/dev/null; then - : -elif ruby -e "require 'yaml'; YAML.load_file('$RULES_FILE')" 2>/dev/null; then - : -else - echo "ERROR: YAML syntax error or no YAML parser available" - exit 1 -fi +for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do + if python3 -c "import yaml; yaml.safe_load(open('$file'))" 2>/dev/null; then + : + elif ruby -e "require 'yaml'; YAML.load_file('$file')" 2>/dev/null; then + : + else + echo "ERROR: YAML syntax error or no YAML parser available: $file" + exit 1 + fi +done log "✅ YAML 語法驗證通過" # Dry run 模式 if [ "$DRY_RUN" = "--dry-run" ]; then - log "DRY RUN: would deploy $RULES_FILE to ${TARGET_HOST}:${TARGET_PATH}" - RULE_COUNT=$(grep -c "alert:" "$RULES_FILE") - log "規則數量: $RULE_COUNT 條" + log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}" + log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}" + ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE") + SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE") + SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE") + log "告警規則數量: $ALERT_COUNT 條;SLO recording: $SLO_RECORD_COUNT 條;SLO alerts: $SLO_ALERT_COUNT 條" exit 0 fi # 備份現有規則 -ssh wooo@${TARGET_HOST} "cp ${TARGET_PATH} ${TARGET_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true" +ssh wooo@${TARGET_HOST} "\ + cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \ + cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true" log "✅ 現有規則已備份" # 部署新規則 -scp "$RULES_FILE" wooo@${TARGET_HOST}:${TARGET_PATH} +scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH} +scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH} log "✅ 規則已複製到 ${TARGET_HOST}" # Reload Prometheus @@ -72,4 +84,15 @@ for rule in "${KEY_RULES[@]}"; do fi done +KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical") +for rule in "${KEY_SLO_RULES[@]}"; do + EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"") + if [ "$EXISTS" = "OK" ]; then + log "✅ $rule" + else + echo "❌ $rule 未找到" + exit 1 + fi +done + log "🎉 部署完成!所有關鍵規則已生效"