fix(governance): stabilize adr100 km growth slo

2026-05-14 19:33:52 +08:00
parent cdb8bf6802
commit d2a4a17969
9 changed files with 267 additions and 30 deletions
--- a/.gitea/workflows/deploy-alerts.yaml
+++ b/.gitea/workflows/deploy-alerts.yaml
@@ -1,7 +1,7 @@
 # =============================================================================
 # Deploy Prometheus Alert Rules (獨立 workflow)
 # 2026-04-05 Claude Code (ADR-039 I3): 從 cd.yaml 分離
-# 觸發條件: ops/monitoring/alerts-unified.yml 有變更 或 workflow_dispatch
+# 觸發條件: ops/monitoring/alerts-unified.yml / slo-rules.yml 有變更 或 workflow_dispatch
 # 說明: 告警規則部署不依賴應用構建，獨立觸發以加快響應速度
 # =============================================================================

@@ -12,6 +12,8 @@ on:
    branches: [main]
    paths:
      - 'ops/monitoring/alerts-unified.yml'
+      - 'ops/monitoring/slo-rules.yml'
+      - 'scripts/ops/deploy-alerts.sh'
  workflow_dispatch:

 env:
@@ -30,6 +32,7 @@ jobs:
        run: |
          pip3 install -q pyyaml 2>/dev/null || pip install -q pyyaml
          python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/alerts-unified.yml')); print('YAML OK')"
+          python3 -c "import yaml; yaml.safe_load(open('ops/monitoring/slo-rules.yml')); print('SLO YAML OK')"

      - name: Setup SSH key
        run: |
--- a/apps/api/src/services/adr100_slo_metrics_service.py
+++ b/apps/api/src/services/adr100_slo_metrics_service.py
@@ -33,8 +33,11 @@ class VerificationSample:
@dataclass(frozen=True)
 class Adr100SloMetricsSnapshot:
    automation_operations: list[AutomationOperationSample] = field(default_factory=list)
+    automation_operations_24h: list[AutomationOperationSample] = field(default_factory=list)
    post_execution_verifications: list[VerificationSample] = field(default_factory=list)
+    post_execution_verifications_24h: list[VerificationSample] = field(default_factory=list)
    knowledge_entries_total: int = 0
+    knowledge_entries_created_24h: int = 0
    high_confidence_total: int = 0
    high_confidence_success_total: int = 0
    emitted_at: float = field(default_factory=time)
@@ -52,13 +55,33 @@ class Adr100SloMetricsService:
            automation_rows = (
                await db.execute(text(_AUTOMATION_OPERATION_SQL))
            ).fetchall()
+            automation_24h_rows = (
+                await db.execute(text(_AUTOMATION_OPERATION_24H_SQL))
+            ).fetchall()
            verification_rows = (
                await db.execute(text(_POST_EXECUTION_VERIFICATION_SQL))
            ).fetchall()
+            verification_24h_rows = (
+                await db.execute(text(_POST_EXECUTION_VERIFICATION_24H_SQL))
+            ).fetchall()
            knowledge_total = int(
                (await db.execute(text("SELECT count(*) FROM knowledge_entries"))).scalar()
                or 0
            )
+            knowledge_created_24h = int(
+                (
+                    await db.execute(
+                        text(
+                            """
+                            SELECT count(*)
+                            FROM knowledge_entries
+                            WHERE created_at >= NOW() - INTERVAL '24 hours'
+                            """
+                        )
+                    )
+                ).scalar()
+                or 0
+            )
            confidence_row = (
                await db.execute(text(_HIGH_CONFIDENCE_APPROVAL_SQL))
            ).one()
@@ -72,6 +95,14 @@ class Adr100SloMetricsService:
                )
                for row in automation_rows
            ],
+            automation_operations_24h=[
+                AutomationOperationSample(
+                    outcome=str(row.outcome),
+                    operation_type=str(row.operation_type),
+                    count=int(row.count or 0),
+                )
+                for row in automation_24h_rows
+            ],
            post_execution_verifications=[
                VerificationSample(
                    outcome=str(row.outcome),
@@ -79,7 +110,15 @@ class Adr100SloMetricsService:
                )
                for row in verification_rows
            ],
+            post_execution_verifications_24h=[
+                VerificationSample(
+                    outcome=str(row.outcome),
+                    count=int(row.count or 0),
+                )
+                for row in verification_24h_rows
+            ],
            knowledge_entries_total=knowledge_total,
+            knowledge_entries_created_24h=knowledge_created_24h,
            high_confidence_total=int(confidence_row.high_confidence_total or 0),
            high_confidence_success_total=int(
                confidence_row.high_confidence_success_total or 0
@@ -107,6 +146,23 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
            'automation_operation_log_total{outcome="none",operation_type="none"} 0'
        )

+    lines.extend([
+        "# HELP automation_operation_created_24h DB-derived AI automation operation count created in the last 24 hours for ADR-100 SLO dashboards",
+        "# TYPE automation_operation_created_24h gauge",
+    ])
+    if snapshot.automation_operations_24h:
+        for sample in snapshot.automation_operations_24h:
+            lines.append(
+                "automation_operation_created_24h"
+                f'{{outcome="{_escape_label(sample.outcome)}",'
+                f'operation_type="{_escape_label(sample.operation_type)}"}} '
+                f"{sample.count}"
+            )
+    else:
+        lines.append(
+            'automation_operation_created_24h{outcome="none",operation_type="none"} 0'
+        )
+
    lines.extend([
        "# HELP post_execution_verification_total DB-derived post execution verification result count for ADR-100 SLOs",
        "# TYPE post_execution_verification_total counter",
@@ -120,10 +176,26 @@ def render_adr100_slo_metrics(snapshot: Adr100SloMetricsSnapshot) -> str:
    else:
        lines.append('post_execution_verification_total{outcome="none"} 0')

+    lines.extend([
+        "# HELP post_execution_verification_created_24h DB-derived post execution verification result count created in the last 24 hours for ADR-100 SLO dashboards",
+        "# TYPE post_execution_verification_created_24h gauge",
+    ])
+    if snapshot.post_execution_verifications_24h:
+        for sample in snapshot.post_execution_verifications_24h:
+            lines.append(
+                "post_execution_verification_created_24h"
+                f'{{outcome="{_escape_label(sample.outcome)}"}} {sample.count}'
+            )
+    else:
+        lines.append('post_execution_verification_created_24h{outcome="none"} 0')
+
    lines.extend([
        "# HELP knowledge_entries_total DB-derived knowledge entry count for ADR-100 SLOs",
        "# TYPE knowledge_entries_total counter",
        f"knowledge_entries_total {snapshot.knowledge_entries_total}",
+        "# HELP knowledge_entries_created_24h DB-derived knowledge entries created in the last 24 hours for ADR-100 SLOs",
+        "# TYPE knowledge_entries_created_24h gauge",
+        f"knowledge_entries_created_24h {snapshot.knowledge_entries_created_24h}",
        "# HELP approval_records_high_confidence_total DB-derived high confidence approval decisions for ADR-100 SLOs",
        "# TYPE approval_records_high_confidence_total counter",
        f"approval_records_high_confidence_total {snapshot.high_confidence_total}",
@@ -180,6 +252,43 @@ _AUTOMATION_OPERATION_SQL = """
 """


+# Per-(outcome, operation_type) row counts limited to rows whose created_at falls
+# within the last 24 hours. Combines automation_operation_log (only the five
+# operation types listed below) with auto_repair_executions, which is reported
+# under the fixed operation_type 'auto_repair_executed'.
+# Outcome for log rows: the raw status when it is not 'success'; 'human_required'
+# for successful approval_execution rows whose input.requested_by does not start
+# with "auto" (case-insensitive); otherwise 'auto_executed'.
+# NOTE(review): presumably mirrors _AUTOMATION_OPERATION_SQL plus the time filter —
+# confirm the two stay in sync when either is changed.
+_AUTOMATION_OPERATION_24H_SQL = """
+    WITH automation_scope AS (
+        SELECT
+            CASE
+                WHEN status <> 'success' THEN status
+                WHEN actor = 'approval_execution'
+                     AND COALESCE(input->>'requested_by', '') NOT ILIKE 'auto%%'
+                    THEN 'human_required'
+                ELSE 'auto_executed'
+            END AS outcome,
+            operation_type
+        FROM automation_operation_log
+        WHERE operation_type IN (
+            'playbook_executed',
+            'remediation_executed',
+            'remediation_verified',
+            'remediation_rolled_back',
+            'self_correction_attempted'
+        )
+          AND created_at >= NOW() - INTERVAL '24 hours'
+        UNION ALL
+        SELECT
+            CASE WHEN success THEN 'auto_executed' ELSE 'failed' END AS outcome,
+            'auto_repair_executed' AS operation_type
+        FROM auto_repair_executions
+        WHERE created_at >= NOW() - INTERVAL '24 hours'
+    )
+    SELECT
+        outcome,
+        operation_type,
+        count(*) AS count
+    FROM automation_scope
+    GROUP BY outcome, operation_type
+    ORDER BY outcome, operation_type
+"""
+
+
 _POST_EXECUTION_VERIFICATION_SQL = """
    SELECT verification_result AS outcome, count(*) AS count
    FROM incident_evidence
@@ -189,6 +298,16 @@ _POST_EXECUTION_VERIFICATION_SQL = """
 """


+# Counts incident_evidence rows per verification_result, restricted to rows with a
+# non-NULL verification_result whose collected_at falls within the last 24 hours.
+_POST_EXECUTION_VERIFICATION_24H_SQL = """
+    SELECT verification_result AS outcome, count(*) AS count
+    FROM incident_evidence
+    WHERE verification_result IS NOT NULL
+      AND collected_at >= NOW() - INTERVAL '24 hours'
+    GROUP BY verification_result
+    ORDER BY verification_result
+"""
+
+
 _HIGH_CONFIDENCE_APPROVAL_SQL = """
    WITH approval_confidence AS (
        SELECT
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -394,7 +394,7 @@ class GovernanceAgent:
        SLO 1 自主化率:     sli:autonomy_rate:5m      硬紅線 < 0.70
        SLO 2 決策準確率:   sli:decision_accuracy:5m  硬紅線 < 0.85
        SLO 3 信心校準:     sli:confidence_calibration:1h 硬紅線 < 0.70
-        SLO 4 KM 增長率:    sli:km_growth_rate:24h    硬紅線 < 5
+        SLO 4 KM 增長率:    knowledge_entries_created_24h / sli:km_growth_rate:24h 硬紅線 < 5

        2026-04-27 P3.4 by Claude — AI SLO（ADR-100）
        """
@@ -409,7 +409,7 @@ class GovernanceAgent:
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
-            "km_growth_rate": "sli:km_growth_rate:24h",
+            "km_growth_rate": "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h",
        }
        # 硬紅線：低於此值必須告警（非軟性警告）
        hard_red_lines: dict[str, float] = {
--- a/apps/api/tests/test_adr100_slo_metrics_service.py
+++ b/apps/api/tests/test_adr100_slo_metrics_service.py
@@ -20,11 +20,22 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
                count=2,
            ),
        ],
+        automation_operations_24h=[
+            AutomationOperationSample(
+                outcome="auto_executed",
+                operation_type="auto_repair_executed",
+                count=3,
+            ),
+        ],
        post_execution_verifications=[
            VerificationSample(outcome="success", count=7),
            VerificationSample(outcome="failed", count=1),
        ],
+        post_execution_verifications_24h=[
+            VerificationSample(outcome="success", count=5),
+        ],
        knowledge_entries_total=2161,
+        knowledge_entries_created_24h=25,
        high_confidence_total=9,
        high_confidence_success_total=7,
        emitted_at=1_778_756_000,
@@ -36,8 +47,14 @@ def test_render_adr100_slo_metrics_outputs_required_series() -> None:
        'automation_operation_log_total{outcome="auto_executed",'
        'operation_type="playbook_executed"} 8'
    ) in rendered
+    assert (
+        'automation_operation_created_24h{outcome="auto_executed",'
+        'operation_type="auto_repair_executed"} 3'
+    ) in rendered
    assert 'post_execution_verification_total{outcome="success"} 7' in rendered
+    assert 'post_execution_verification_created_24h{outcome="success"} 5' in rendered
    assert "knowledge_entries_total 2161" in rendered
+    assert "knowledge_entries_created_24h 25" in rendered
    assert "approval_records_high_confidence_total 9" in rendered
    assert "approval_records_high_confidence_success_total 7" in rendered
    assert "adr100_slo_emitter_last_success_timestamp 1778756000" in rendered
@@ -49,8 +66,11 @@ def test_render_adr100_slo_metrics_emits_zero_series_when_empty() -> None:
    )

    assert 'automation_operation_log_total{outcome="none",operation_type="none"} 0' in rendered
+    assert 'automation_operation_created_24h{outcome="none",operation_type="none"} 0' in rendered
    assert 'post_execution_verification_total{outcome="none"} 0' in rendered
+    assert 'post_execution_verification_created_24h{outcome="none"} 0' in rendered
    assert "knowledge_entries_total 0" in rendered
+    assert "knowledge_entries_created_24h 0" in rendered


 def test_render_adr100_slo_metrics_escapes_labels() -> None:
--- a/apps/api/tests/test_governance_agent.py
+++ b/apps/api/tests/test_governance_agent.py
@@ -654,6 +654,7 @@ class _FakePrometheusResponse:
 class _FakePrometheusClient:
    def __init__(self, value: str) -> None:
        self._value = value
+        self.queries: list[str] = []

    async def __aenter__(self):
        return self
@@ -662,6 +663,7 @@ class _FakePrometheusClient:
        return False

    async def get(self, *args, **kwargs):  # noqa: ANN002, ANN003
+        self.queries.append(str(kwargs.get("params", {}).get("query", "")))
        return _FakePrometheusResponse(self._value)


@@ -683,3 +685,16 @@ class TestCheckSloCompliance:
            assert result[name]["status"] == "skipped"
            assert result[name]["reason"] == "prometheus_nan_or_inf"
        assert result["_meta"]["status"] == "no_data"
+
+    @pytest.mark.asyncio
+    async def test_km_growth_prefers_db_derived_24h_gauge(self):
+        """The KM SLO must query the DB-derived 24h gauge first so a counter that is still warming up is not misreported as 0."""
+        agent = _make_agent()
+        client = _FakePrometheusClient("25")
+
+        with patch("httpx.AsyncClient", return_value=client):
+            result = await agent.check_slo_compliance()
+
+        assert "max(knowledge_entries_created_24h) or sli:km_growth_rate:24h" in client.queries
+        assert result["km_growth_rate"]["status"] == "ok"
+        assert result["km_growth_rate"]["value"] == 25
--- a/docs/adr/ADR-100-ai-autonomous-slo.md
+++ b/docs/adr/ADR-100-ai-autonomous-slo.md
@@ -110,11 +110,18 @@ sum(rate(approval_records_high_confidence_total[1h]))

 **SLI 計算式**:
 ```promql
+max(knowledge_entries_created_24h)
+or
 increase(knowledge_entries_total[24h])
 ```

 **Recording rule**: `sli:km_growth_rate:24h`

+**資料來源備註（2026-05-14 T19）**：`knowledge_entries_created_24h`
+是 API `/metrics` 直接從 PostgreSQL `knowledge_entries.created_at >= now()-24h`
+產出的 gauge。`increase(knowledge_entries_total[24h])` 只作舊 counter fallback，
+避免 emitter 新上線時 Prometheus 還沒有 24h counter history 而誤報 KM 增長為 0。
+
 **目標值（SLO）**: ≥ 20 筆/day

 **Error budget**：不適用標準 burn rate（絕對值 SLO），改用閾值告警
@@ -158,7 +165,7 @@ increase(knowledge_entries_total[24h])
 | `ops/monitoring/tests/test_slo_rules.yaml` | promtool 單元測試 |
 | `ops/monitoring/grafana/dashboards/ai-slo-dashboard.json` | Grafana SLO Dashboard |
 | `apps/api/src/services/governance_agent.py` | `check_slo_compliance()` 整合 |
-| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18：從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series；`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍，背景治理工作不進 AI 自動修復 SLO 分母 |
+| `apps/api/src/services/adr100_slo_metrics_service.py` | 2026-05-14 T18：從 PostgreSQL 事實來源輸出 ADR-100 底層 Prometheus series；`automation_operation_log_total` 僅納入 remediation / PlayBook / auto-repair 範圍，背景治理工作不進 AI 自動修復 SLO 分母。2026-05-14 T19：追加 `*_created_24h` gauges，供治理 Agent / 前端直接顯示最近 24h 事實量，避免 counter 暖機造成 false red |
 | `apps/api/src/main.py` `/metrics` | 2026-05-14 T18：追加 DB-derived SLO emitter，讓既有 `awoooi-api` scrape job 取得底層 series |

 ## 決策理由
--- a/ops/monitoring/slo-rules.yml
+++ b/ops/monitoring/slo-rules.yml
@@ -45,9 +45,12 @@ groups:
          /
          sum(rate(approval_records_high_confidence_total[1h]))

-      # SLO 4: KM 增長率 = 24h increase (絕對值，不做 rate)
+      # SLO 4: KM growth = DB-derived 24h gauge, else counter increase. "or on()" matches on no labels, so the labelled (job/instance) counter series is dropped whenever the gauge exists instead of being emitted alongside it.
      - record: sli:km_growth_rate:24h
-        expr: increase(knowledge_entries_total[24h])
+        expr: |
+          max(knowledge_entries_created_24h)
+          or on()
+          increase(knowledge_entries_total[24h])

      # -----------------------------------------------------------------------
      # Error Budget Recording Rules（輔助 Grafana 顯示）
@@ -248,7 +251,7 @@ groups:
          summary: "SLO KM 增長率嚴重不足（< 5 筆/day）— 疑似 KM 鏈斷裂"
          description: "過去 24h KM 新增 {{ $value }} 筆，遠低於目標 20 筆/day，飛輪學習迴圈疑似中斷。"
          runbook: |
-            1. 確認 knowledge_entries_total counter 是否正常遞增
+            1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
            2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
            3. 確認 auto_execute 後 KM 寫入路徑（feedback_flywheel_km_write_gap.md）
            4. 手動執行 POST /api/v1/governance/check
--- a/ops/monitoring/tests/test_slo_rules.yaml
+++ b/ops/monitoring/tests/test_slo_rules.yaml
@@ -35,7 +35,8 @@ tests:
      - expr: sli:autonomy_rate:5m
        eval_time: 15m
        exp_samples:
-          - value: 0.8
+          - labels: '{__name__="sli:autonomy_rate:5m"}'
+            value: 0.8

  # ---- SLI 1: 自主化率 = 100%（無 human_required）----
  - interval: 1m
@@ -47,7 +48,8 @@ tests:
      - expr: sli:autonomy_rate:5m
        eval_time: 15m
        exp_samples:
-          - value: 1.0
+          - labels: '{__name__="sli:autonomy_rate:5m"}'
+            value: 1.0

  # ---- SLI 2: 決策準確率 = 90% (success=9, auto_executed=10) ----
  - interval: 1m
@@ -61,7 +63,8 @@ tests:
      - expr: sli:decision_accuracy:5m
        eval_time: 15m
        exp_samples:
-          - value: 0.9
+          - labels: '{__name__="sli:decision_accuracy:5m"}'
+            value: 0.9

  # ---- SLI 4: KM 增長率（24h increase）----
  - interval: 1m
@@ -74,7 +77,23 @@ tests:
        eval_time: 25h
        exp_samples:
          # increase over 24h = 1440 samples × 1/min
-          - value: 1440
+          - labels: '{__name__="sli:km_growth_rate:24h"}'
+            value: 1440
+
+  # ---- SLI 4: DB-derived gauge 優先，避免 counter 新上線暖機誤報 0 ----
+  - interval: 1m
+    name: "sli:km_growth_rate:24h 應優先使用 knowledge_entries_created_24h"
+    input_series:
+      - series: "knowledge_entries_created_24h"
+        values: "25x30"
+      - series: "knowledge_entries_total"
+        values: "100x30"
+    promql_expr_test:
+      - expr: sli:km_growth_rate:24h
+        eval_time: 15m
+        exp_samples:
+          - labels: '{__name__="sli:km_growth_rate:24h"}'
+            value: 25

  # ============================================================
  # Alert Tests — SLO 1: 自主化率
@@ -120,6 +139,10 @@ tests:
              burn_window: 3d
              team: ai
              auto_repair: "false"
+            exp_annotations:
+              summary: "SLO 自主化率 slow burn（長期趨勢偏低）"
+              description: "自主化率長期低於目標，累積 error budget 消耗率偏高，建議本週 review。"
+              runbook: "分析近 7d 數據，是否需要重訓或調整 confidence threshold。"

  # ---- 負測: 自主化率 = 85% → SlowBurn 不觸發 ----
  - interval: 1m
@@ -157,6 +180,10 @@ tests:
              burn_window: 3d
              team: ai
              auto_repair: "false"
+            exp_annotations:
+              summary: "SLO 決策準確率 slow burn（長期趨勢偏低）"
+              description: "決策準確率長期低於目標，累積 error budget 消耗偏高。"
+              runbook: "近 7d verifier 失敗分析，考慮 playbook fine-tune。"

  # ---- 負測: 決策準確率 = 92% → SlowBurn 不觸發 ----
  - interval: 1m
@@ -192,6 +219,14 @@ tests:
              slo_name: km_growth_rate
              team: ai
              auto_repair: "false"
+            exp_annotations:
+              summary: "SLO KM 增長率嚴重不足（< 5 筆/day）— 疑似 KM 鏈斷裂"
+              description: "過去 24h KM 新增 0 筆，遠低於目標 20 筆/day，飛輪學習迴圈疑似中斷。"
+              runbook: |
+                1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
+                2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
+                3. 確認 auto_execute 後 KM 寫入路徑（feedback_flywheel_km_write_gap.md）
+                4. 手動執行 POST /api/v1/governance/check

  # ---- 正測: KM 增長率 = 3/day → Critical 觸發（< 5）----
  - interval: 30m
@@ -210,6 +245,14 @@ tests:
              slo_name: km_growth_rate
              team: ai
              auto_repair: "false"
+            exp_annotations:
+              summary: "SLO KM 增長率嚴重不足（< 5 筆/day）— 疑似 KM 鏈斷裂"
+              description: "過去 24h KM 新增 2.9999999999999996 筆，遠低於目標 20 筆/day，飛輪學習迴圈疑似中斷。"
+              runbook: |
+                1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
+                2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
+                3. 確認 auto_execute 後 KM 寫入路徑（feedback_flywheel_km_write_gap.md）
+                4. 手動執行 POST /api/v1/governance/check

  # ---- 負測: KM 增長率 = 30/day → Critical 不觸發 ----
  - interval: 1m
@@ -240,3 +283,7 @@ tests:
              slo_name: km_growth_rate
              team: ai
              auto_repair: "false"
+            exp_annotations:
+              summary: "SLO KM 增長率偏低（< 20 筆/day）"
+              description: "過去 24h KM 新增 14.976000000000393 筆，低於目標 20 筆/day。"
+              runbook: "查 KM 寫入路徑（auto_execute 後 _write_execution_result_to_km），確認飛輪 KM 閉環正常。"
--- a/scripts/ops/deploy-alerts.sh
+++ b/scripts/ops/deploy-alerts.sh
@@ -6,45 +6,57 @@

 set -eo pipefail

-RULES_FILE="ops/monitoring/alerts-unified.yml"
+ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
+SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
 TARGET_HOST="192.168.0.110"
-TARGET_PATH="/home/wooo/monitoring/alerts.yml"
+TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
+TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
 PROMETHEUS_URL="http://${TARGET_HOST}:9090"
 DRY_RUN="${1:-}"

 log() { echo "[$(date '+%H:%M:%S')] $*"; }

 # 確認檔案存在
-if [ ! -f "$RULES_FILE" ]; then
-    echo "ERROR: $RULES_FILE not found"
-    exit 1
-fi
+for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
+    if [ ! -f "$file" ]; then
+        echo "ERROR: $file not found"
+        exit 1
+    fi
+done

 # 驗證 YAML 語法
-if python3 -c "import yaml; yaml.safe_load(open('$RULES_FILE'))" 2>/dev/null; then
-    :
-elif ruby -e "require 'yaml'; YAML.load_file('$RULES_FILE')" 2>/dev/null; then
-    :
-else
-    echo "ERROR: YAML syntax error or no YAML parser available"
-    exit 1
-fi
+for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
+    if python3 -c "import yaml; yaml.safe_load(open('$file'))" 2>/dev/null; then
+        :
+    elif ruby -e "require 'yaml'; YAML.load_file('$file')" 2>/dev/null; then
+        :
+    else
+        echo "ERROR: YAML syntax error or no YAML parser available: $file"
+        exit 1
+    fi
+done
 log "✅ YAML 語法驗證通過"

 # Dry run 模式
 if [ "$DRY_RUN" = "--dry-run" ]; then
-    log "DRY RUN: would deploy $RULES_FILE to ${TARGET_HOST}:${TARGET_PATH}"
-    RULE_COUNT=$(grep -c "alert:" "$RULES_FILE")
-    log "規則數量: $RULE_COUNT 條"
+    log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
+    log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
+    ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE" || true)  # grep -c prints 0 but exits 1 on zero matches; "|| true" keeps set -e from aborting the dry run
+    SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE" || true)
+    SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE" || true)
+    log "告警規則數量: $ALERT_COUNT 條；SLO recording: $SLO_RECORD_COUNT 條；SLO alerts: $SLO_ALERT_COUNT 條"
    exit 0
 fi

 # 備份現有規則
-ssh wooo@${TARGET_HOST} "cp ${TARGET_PATH} ${TARGET_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
+ssh wooo@${TARGET_HOST} "\
+    cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \
+    cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
 log "✅ 現有規則已備份"

 # 部署新規則
-scp "$RULES_FILE" wooo@${TARGET_HOST}:${TARGET_PATH}
+scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH}
+scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH}
 log "✅ 規則已複製到 ${TARGET_HOST}"

 # Reload Prometheus
@@ -72,4 +84,15 @@ for rule in "${KEY_RULES[@]}"; do
    fi
 done

+KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical")
+for rule in "${KEY_SLO_RULES[@]}"; do
+    EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"")
+    if [ "$EXISTS" = "OK" ]; then
+        log "✅ $rule"
+    else
+        echo "❌ $rule 未找到"
+        exit 1
+    fi
+done
+
 log "🎉 部署完成！所有關鍵規則已生效"