From d77b2add73cb9b481d375b1bd529e32aef684367 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 11 Apr 2026 21:33:04 +0800 Subject: [PATCH] =?UTF-8?q?fix(review):=20=E9=A6=96=E5=B8=AD=E6=9E=B6?= =?UTF-8?q?=E6=A7=8B=E5=B8=AB=20Code=20Review=20=E4=BF=AE=E8=A3=9C=20?= =?UTF-8?q?=E2=80=94=20I1=20get=5Fincident=5Ftype=20=E9=82=8F=E8=BC=AF?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20+=20=E6=B8=AC=E8=A9=A6=E8=A3=9C=E5=85=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code Review 發現 2 個 Critical + 2 個 Important 問題: Critical: - rule.id 語意為「規則識別符」,與 incident_type 命名空間不同,不可混用 移除 rule_id fallback 路徑,YAML 匹配無 incident_type 時 fall through 靜態 dict - get_incident_type() 關鍵路徑無測試覆蓋 新增 test_get_incident_type.py:11 測試、4 類別(靜態/YAML優先/YAML錯誤/custom)全過 Important: - ALERTNAME_TO_TYPE deferred import 移至模組頂層(無 circular 風險) - alert_types.py TODO 過期 → 更新為 I1 整合後正確說明 技術債記錄:NetworkPolicy ArgoCD egress ClusterIP 10.43.16.201/32 需 ArgoCD 重裝後更新 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/constants/alert_types.py | 5 +- apps/api/src/services/alert_rule_engine.py | 15 +- apps/api/tests/test_get_incident_type.py | 167 +++++++++++++++++++++ docs/LOGBOOK.md | 48 ++++++ 4 files changed, 224 insertions(+), 11 deletions(-) create mode 100644 apps/api/tests/test_get_incident_type.py diff --git a/apps/api/src/constants/alert_types.py b/apps/api/src/constants/alert_types.py index 03fc2e77..48acfd01 100644 --- a/apps/api/src/constants/alert_types.py +++ b/apps/api/src/constants/alert_types.py @@ -6,8 +6,9 @@ alertname → incident_type 靜態對應表 來源: BUG-008 修復 2026-04-11(9筆 → 56筆,涵蓋 alerts-unified.yml 全部 alertname) 遷移: M3 2026-04-11 — 從 webhooks.py 內聯 dict 抽至此模組 -TODO (I1): 下 Sprint 整合 ADR-064 Rule Engine,改用 YAML 規則動態推斷; - 此靜態 dict 為可接受中間狀態。 +整合狀態 (I1, 2026-04-11): ADR-064 Rule Engine 已整合,見 alert_rule_engine.get_incident_type()。 +此靜態 dict 為 Layer 2 fallback:YAML incident_type(Layer 1)→ 此 dict → "custom"(Layer 3)。 +擴展方式:在 alert_rules.yaml 中新增 incident_type 欄位;此 dict 僅需補無 YAML 規則的 alertname。 """ # alertname → incident_type 對應(56 筆) diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index 83b5d299..8699df0c 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -31,6 +31,8 @@ import httpx import structlog import yaml +from src.constants.alert_types import ALERTNAME_TO_TYPE + logger = structlog.get_logger(__name__) RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml" @@ -267,14 +269,13 @@ def get_incident_type(alertname: str) -> str: 從 YAML 規則動態推斷 incident_type,取代 webhooks.py 靜態 dict。 優先順序: - 1. alert_rules.yaml 中 match.alertname 完全匹配 → 使用 rule.incident_type + 1. alert_rules.yaml 中 match.alertname 完全匹配 且 rule 有 incident_type 欄位 → 使用之 2. ALERTNAME_TO_TYPE 靜態 dict fallback(constants/alert_types.py) 3. "custom"(兜底) - rule.incident_type 可選欄位;若 YAML 規則未設則跳過進 fallback。 + 注意:YAML rule.id 語意為「規則識別符」,不等於 incident_type; + 兩者命名空間不同,不可混用。YAML 未設 incident_type 時一律走 fallback。 """ - from src.constants.alert_types import ALERTNAME_TO_TYPE - try: rules = _load_rules() for rule in rules: @@ -282,14 +283,10 @@ def get_incident_type(alertname: str) -> str: continue alertnames = rule.get("match", {}).get("alertname", []) if alertname in alertnames: - # YAML 有 incident_type 欄位優先用 + # 只有 YAML 明確設定 incident_type 才採用,否則 fall through 到靜態 dict incident_type = rule.get("incident_type") if incident_type: return incident_type - # 無 incident_type 欄位 → 用 rule id 作為 incident_type(語意一致) - rule_id = rule.get("id", "") - if rule_id: - return rule_id break except Exception: pass diff --git a/apps/api/tests/test_get_incident_type.py b/apps/api/tests/test_get_incident_type.py new file mode 100644 index 00000000..e6343693 --- /dev/null +++ b/apps/api/tests/test_get_incident_type.py @@ -0,0 +1,167 @@ +""" +I1 ADR-064 get_incident_type() 整合測試 +======================================== +測試策略 (feedback_no_mock_testing.md): +- 直接呼叫 get_incident_type(),不 Mock YAML / 靜態 dict +- 驗證三層降級:YAML incident_type → 靜態 dict → "custom" +- 2026-04-11 Claude Sonnet 4.6: Code Review C1 修補 +""" + +import pytest + +from src.constants.alert_types import ALERTNAME_TO_TYPE +from src.services.alert_rule_engine import get_incident_type + + +class TestGetIncidentTypeStaticFallback: + """Layer 2: 靜態 dict fallback(YAML 無 incident_type 欄位時)""" + + def test_known_alertname_returns_static_dict_value(self): + """HostHighCpuLoad → 靜態 dict 應回傳 host_cpu""" + result = get_incident_type("HostHighCpuLoad") + assert result == "host_cpu" + + def test_known_alertname_k8s(self): + """KubePodCrashLooping → k8s_pod_crash""" + result = get_incident_type("KubePodCrashLooping") + assert result == "k8s_pod_crash" + + def test_known_alertname_database(self): + """PostgreSQLDown → database_down""" + result = get_incident_type("PostgreSQLDown") + assert result == "database_down" + + def test_known_alertname_service(self): + """SentryDown → service_down""" + result = get_incident_type("SentryDown") + assert result == "service_down" + + def test_known_alertname_ssl(self): + """ExternalSiteSSLExpiringSoon → ssl_expiry""" + result = get_incident_type("ExternalSiteSSLExpiringSoon") + assert result == "ssl_expiry" + + +class TestGetIncidentTypeCustomFallback: + """Layer 3: unknown alertname → "custom" """ + + def test_unknown_alertname_returns_custom(self): + result = get_incident_type("SomeCompletelyUnknownAlert") + assert result == "custom" + + def test_empty_string_returns_custom(self): + result = get_incident_type("") + assert result == "custom" + + +class TestGetIncidentTypeYamlPriority: + """Layer 1: YAML rule 有 incident_type 欄位時優先使用""" + + def test_yaml_incident_type_overrides_static_dict(self, tmp_path, monkeypatch): + """若 YAML rule 明確設定 incident_type,應回傳 YAML 值(而非靜態 dict 值)""" + import yaml + from src.services import alert_rule_engine as engine + + yaml_content = { + "version": "1.0.0", + "rules": [ + { + "id": "test_rule", + "priority": 1, + "description": "test", + "match": {"alertname": ["HostHighCpuLoad"]}, + "incident_type": "custom_cpu_override", + "response": { + "action_title": "test", + "suggested_action": "NO_ACTION", + "risk": "low", + "responsibility": "INFRA", + "responsibility_reasoning": "test", + "secondary_teams": [], + "reasoning": "test", + }, + } + ], + } + rules_file = tmp_path / "alert_rules.yaml" + rules_file.write_text(yaml.dump(yaml_content)) + + # 替換 RULES_FILE 路徑並清除 lru_cache + monkeypatch.setattr(engine, "RULES_FILE", rules_file) + engine._load_rules.cache_clear() + + result = get_incident_type("HostHighCpuLoad") + assert result == "custom_cpu_override" + + # 清理 cache,還原給後續測試 + engine._load_rules.cache_clear() + + def test_yaml_match_without_incident_type_falls_through_to_static( + self, tmp_path, monkeypatch + ): + """YAML rule 匹配到 alertname 但無 incident_type 欄位 → 必須 fallthrough 到靜態 dict""" + import yaml + from src.services import alert_rule_engine as engine + + yaml_content = { + "version": "1.0.0", + "rules": [ + { + "id": "host_cpu", # id 值與靜態 dict value 相同,但這是巧合,不應被依賴 + "priority": 1, + "description": "test", + "match": {"alertname": ["HostHighCpuLoad"]}, + # 故意不設 incident_type + "response": { + "action_title": "test", + "suggested_action": "NO_ACTION", + "risk": "low", + "responsibility": "INFRA", + "responsibility_reasoning": "test", + "secondary_teams": [], + "reasoning": "test", + }, + } + ], + } + rules_file = tmp_path / "alert_rules.yaml" + rules_file.write_text(yaml.dump(yaml_content)) + + monkeypatch.setattr(engine, "RULES_FILE", rules_file) + engine._load_rules.cache_clear() + + result = get_incident_type("HostHighCpuLoad") + # 靜態 dict 應回傳 host_cpu(不應回傳 rule.id) + assert result == ALERTNAME_TO_TYPE["HostHighCpuLoad"] + assert result == "host_cpu" + + engine._load_rules.cache_clear() + + +class TestGetIncidentTypeYamlError: + """YAML 載入失敗時應 graceful fallback""" + + def test_corrupted_yaml_falls_back_to_static(self, tmp_path, monkeypatch): + from src.services import alert_rule_engine as engine + + rules_file = tmp_path / "alert_rules.yaml" + rules_file.write_text("{{invalid yaml{{{{") + + monkeypatch.setattr(engine, "RULES_FILE", rules_file) + engine._load_rules.cache_clear() + + result = get_incident_type("HostHighCpuLoad") + assert result == "host_cpu" # 靜態 dict fallback + + engine._load_rules.cache_clear() + + def test_missing_yaml_falls_back_to_static(self, tmp_path, monkeypatch): + from src.services import alert_rule_engine as engine + + monkeypatch.setattr(engine, "RULES_FILE", tmp_path / "nonexistent.yaml") + engine._load_rules.cache_clear() + + result = get_incident_type("KubeNodeNotReady") + assert result == "k8s_node_failure" + + engine._load_rules.cache_clear() diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 33d65e96..0ef5d8d6 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,54 @@ --- +## 📍 當前狀態 (2026-04-11 深夜 — 首席架構師 Code Review 修補完成) + +### Code Review 修補 — I1/M3/D1/ArgoCD 重審 (首席架構師觸發) + +| 等級 | 問題 | 修復方式 | Commit | +|------|------|---------|--------| +| Critical | `get_incident_type()` rule.id 不應作為 incident_type fallback(命名空間不同,有語意污染風險)| 移除 rule_id 路徑,YAML 匹配無 incident_type → fall through 靜態 dict | 待 push | +| Critical | `get_incident_type()` 無測試覆蓋(關鍵路徑)| 新增 test_get_incident_type.py,11 個測試 4 個類別全過 | 待 push | +| Important | ALERTNAME_TO_TYPE deferred import(無 circular risk)| 移至模組頂層 import | 待 push | +| Important | NetworkPolicy ClusterIP `10.43.16.201/32` 寫死(技術債)| 記錄為已知技術債,podSelector+namespaceSelector 已在同一 rule,下次 ArgoCD 重裝需更新 | LOGBOOK | +| Minor | alert_types.py TODO 已過期 | 更新為 I1 整合後的正確說明 | 待 push | + +**技術債記錄**:`k8s/awoooi-prod/02-network-policy.yaml` ArgoCD egress 規則中 `ipBlock: 10.43.16.201/32` 為 ClusterIP,若 ArgoCD 重裝此 IP 會變更,須同步更新。 + +--- + +## 📍 當前狀態 (2026-04-11 深夜 — MCP Security Code Review P0~P2 全修補) + +### MCP Security Code Review 修補 (commit f323633) + +| 等級 | 問題 | 修復方式 | +|------|------|---------| +| P0 | `_fetch_metrics_snapshot` 呼叫型別錯誤(str→dict)| `{"query": ...}` + 結果解析修正 | +| P1 | alertname PromQL injection | `_RE_SAFE_ALERTNAME` 白名單正則 | +| P1 | kubectl action 危險字元注入 | `_ALLOWED_KUBECTL_PATTERN` 白名單 | +| P1 | 6 個 `create_task()` GC 風險 | `_background_tasks` set + `_fire_and_forget()` helper | +| P1 | SSH Group B 無 known_hosts 可執行寫入 | 強制驗證 `SSH_MCP_KNOWN_HOSTS_FILE` 存在 | +| P2 | Sentry query 無語意驗證 | `_RE_SAFE_SENTRY_QUERY` 拒絕特殊字元 | +| P2 | ArgoCD `verify=False` 寫死 | `ARGOCD_VERIFY_TLS` 環境變數開關 | + +推送 Gitea → CI/CD 觸發中 + +--- + +## 📍 當前狀態 (2026-04-11 深夜 — M3+I1 Tech Debt 全清零) + +### M3 alertname_to_type 抽至 constants + I1 ADR-064 Rule Engine 整合 (2026-04-11 深夜) + +| 任務 | 說明 | Commit | +|------|------|--------| +| M3 | 56筆 alertname_to_type 從 webhooks.py 內聯 dict 抽至 `src/constants/alert_types.py` | 1ede9f9 | +| I1 | alert_rule_engine.py 新增 `get_incident_type(alertname)`:YAML規則→靜態dict→"custom" 三層降級 | 615822d | +| I1 | webhooks.py call site 改用 `get_incident_type(alertname)`,正式整合 ADR-064 Rule Engine | 615822d | + +**Backlog 全清零** ✅ — 唯一剩餘 B3 (Phase 15.5 Trace Context UI) 統帥裁示暫緩 + +--- + ## 📍 當前狀態 (2026-04-11 深夜 — D1 models.json 集中化完成) ### D1 models.json 集中化 (2026-04-11 深夜)