diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index c56472ed..bfb8a4c8 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -614,6 +614,100 @@ rules: optimization: [] reasoning: "[規則匹配] 備份失敗無法自動修復,需人工排查備份腳本、磁碟空間及網路連通性。" + # ── DevOps 工具層 ───────────────────────────────────────── + # 2026-04-14 Claude Sonnet 4.6: Task 2.2 ADR-076 — 新增 devops_tool / ssl_cert / external_site 三類規則 + # 設計原則: CI/CD 工具與外部服務均為 NO_ACTION,不可自動修復(誤操作風險過高) + + - id: gitea_down + priority: 125 + description: Gitea CI/CD 服務下線(不自動修復) + match: + alertname: + - GiteaDown + - GiteaServiceDown + - GiteaUnhealthy + message: + - gitea + - git server + - ci/cd down + response: + action_title: "Gitea ({instance}) 下線 — 需人工確認" + description: "⚠️ 規則匹配: Gitea CI/CD 服務 ({instance}) 無法連線,影響所有部署流程。不自動重啟(誤觸 CD 風險過高)。" + suggested_action: NO_ACTION + kubectl_command: "" + estimated_downtime: "N/A" + risk: critical + responsibility: INFRA + responsibility_reasoning: "Gitea 是 CI/CD 核心,自動重啟有誤觸部署風險,需人工確認狀態後手動操作" + secondary_teams: [] + optimization: + - type: HEALTH_CHECK + description: "確認 Gitea 服務狀態" + command: "ssh {host} 'cd /data/gitea && docker compose ps && docker compose logs --tail=20 gitea'" + reasoning: "[規則匹配] Gitea 下線不自動修復,通知後由人工確認狀態再操作,避免 CD pipeline 誤觸發。" + + - id: ssl_cert_expiring + priority: 126 + description: SSL/TLS 憑證即將到期或已到期 + match: + alertname: + - SSLCertExpiringSoon + - SSLCertExpired + - CertificateExpirationWarning + - TLSCertExpiring + message: + - ssl cert + - certificate expir + - tls cert + - cert will expire + response: + action_title: "SSL 憑證 ({instance}) 即將到期 — 需人工更新" + description: "⚠️ 規則匹配: SSL/TLS 憑證 ({instance}) 即將到期或已到期。無自動修復,需人工確認 cert-manager 或執行 certbot 更新。" + suggested_action: NO_ACTION + kubectl_command: "" + estimated_downtime: "N/A" + risk: medium + responsibility: INFRA + responsibility_reasoning: "SSL 憑證更新需域名驗證,屬基礎設施團隊責任" + secondary_teams: [] + optimization: + - type: CERT_RENEWAL + description: "確認 cert-manager 自動更新狀態" + 
# ── kubectl injection guard (Task 2.3, ADR-076, 2026-04-14) ─────────
# Kept in line with auto_approve._DESTRUCTIVE_PATTERNS and
# decision_manager._ALLOWED_KUBECTL_PATTERN.
# Goal: when a rule's kubectl_command still contains one of the destructive
# patterns below after variable substitution, the caller clears the command
# and logs a warning.

# Zero or more CLI flags, each optionally followed by a separate value token
# ("-n prod", "--namespace=prod", "--force"). This lets the verb patterns see
# through flags placed before the verb or before the resource type, e.g.
# "kubectl -n prod delete pvc x" or "kubectl delete -n prod pvc x".
_KUBECTL_FLAGS = r"(?:--?[\w-]+(?:=\S*)?(?:\s+[^\s-]\S*)?\s+)*"

# Resource kinds whose deletion is considered destructive, including their
# kubectl short names. No trailing boundary on purpose: plural forms and
# "deployment/name" / "deployment.apps" must match as well.
_DESTRUCTIVE_KINDS = (
    r"(?:pvc|persistentvolumeclaim|ns|namespace|sts|statefulset"
    r"|deploy|deployment)"
)

_RULE_ENGINE_DESTRUCTIVE_RE = re.compile(
    "(" + "|".join((
        # Destructive Kubernetes deletes.
        r"kubectl\s+" + _KUBECTL_FLAGS + r"delete\s+"
        + _KUBECTL_FLAGS + _DESTRUCTIVE_KINDS,
        # Node eviction / cordoning ("uncordon" is intentionally not matched).
        r"kubectl\s+" + _KUBECTL_FLAGS + r"(?:drain|cordon)",
        # Scale to zero, both "--replicas=0" and "--replicas 0".
        r"--replicas[=\s]\s*0\b",
        # rm with a recursive/force flag. The leading \b keeps words that merely
        # end in "rm" (e.g. "deployment/platform -f") from matching.
        r"\brm\s+(?:-\S+\s+)*(?:-[a-z]*[rf][a-z]*|--(?:recursive|force))(?=\s|$)",
        # Destructive SQL DDL.
        r"\bdrop\s+(?:table|database)\b",
        # Shell command substitution $(...); no length cap, so a long payload
        # cannot slip past the check.
        r"\$\([^)]*\)",
        # Backtick command substitution.
        r"`[^`]*`",
    )) + ")",
    re.IGNORECASE,
)

# ── kubectl injection guard: public API ─────────────────────────────


def validate_kubectl_command(command: str) -> bool:
    """Check a rule command for destructive or injection patterns.

    Intended to be called on a rule's ``kubectl_command`` after variable
    substitution (Task 2.3, ADR-076).

    Args:
        command: The fully substituted command string. May be empty.

    Returns:
        True if the command is considered safe to hand on; False if it
        contains a destructive pattern, in which case the caller should clear
        the command and log a warning.

    Always accepted:
        - An empty string (rules without an action).
        - Commands starting with ``"ssh "`` (host-level commands that do not
          go through the kubectl path).

    Rejected:
        - ``kubectl delete`` of pvc / namespace / statefulset / deployment,
          including short names (``ns``, ``sts``, ``deploy``) and flags placed
          before the verb or the resource type.
        - ``kubectl drain`` / ``kubectl cordon``.
        - ``--replicas=0`` or ``--replicas 0``.
        - ``rm`` with a recursive or force flag (``-r``, ``-f``, ``-rf``,
          ``-rfv``, ``--recursive``, ``--force``).
        - ``DROP TABLE`` / ``DROP DATABASE``.
        - ``$(...)`` or backtick command substitution.
    """
    if not command:
        return True
    # NOTE(review): ssh-prefixed commands skip every check, including the
    # command-substitution ones, even though validation runs after variable
    # substitution. Confirm that values substituted into ssh commands (e.g. the
    # host) can never be attacker-influenced.
    if command.strip().startswith("ssh "):
        return True
    return _RULE_ENGINE_DESTRUCTIVE_RE.search(command) is None
"""
Tests for the AlertRuleEngine kubectl injection guard (Task 2.3).

Covers validate_kubectl_command():
- empty / whitespace-only commands and ssh commands are accepted
- common, legitimate kubectl commands are accepted
- destructive patterns are rejected

No mocks are used: the checks are pure Python and call the real
validate_kubectl_command() function, so no DB/Redis/YAML is needed.

Created: 2026-04-14 (Taipei time) Claude Sonnet 4.6 (Task 2.3)
"""

import pytest

from src.services.alert_rule_engine import validate_kubectl_command


# =============================================================================
# Command fixtures
# =============================================================================

# Host-level commands; accepted because they start with "ssh ".
_SSH_COMMANDS = [
    "ssh 192.168.0.188 'systemctl restart ollama'",
    "ssh {host} 'docker restart minio'",
    "ssh root@192.168.0.110 'cd /data/gitea && docker compose ps'",
]

# Everyday kubectl operations that must not be flagged.
_SAFE_KUBECTL_COMMANDS = [
    "kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
    "kubectl rollout restart deployment/postgresql -n awoooi-prod",
    "kubectl scale deployment awoooi-api --replicas=3 -n awoooi-prod",
    "kubectl delete pod awoooi-api-abc123 -n awoooi-prod",
    "kubectl logs awoooi-api -n awoooi-prod --previous --tail=50",
    "kubectl get pods -n awoooi-prod",
    "kubectl describe node k3s-node-01",
    "kubectl get nodes -o wide",
    "kubectl autoscale deployment awoooi-api --min=2 --max=5 -n awoooi-prod",
    "kubectl set resources deployment/awoooi-api --limits=memory=1Gi -n awoooi-prod",
]

_DESTRUCTIVE_DELETES = [
    "kubectl delete pvc awoooi-data -n awoooi-prod",
    "kubectl delete namespace awoooi-prod",
    "kubectl delete statefulset postgresql -n awoooi-prod",
    "kubectl delete deployment awoooi-api -n awoooi-prod",
]

# The kubectl patch JSON form is not used by the YAML rule set, so it is
# deliberately left out of these cases.
_SCALE_TO_ZERO = [
    "kubectl scale deployment awoooi-api --replicas=0 -n awoooi-prod",
    "kubectl scale deployment awoooi-api --replicas= 0 -n awoooi-prod",
]

_NODE_EVICTIONS = [
    "kubectl drain k3s-node-01 --ignore-daemonsets",
    "kubectl cordon k3s-node-01",
]

# ssh-prefixed commands are trusted by design, so these cases exercise
# rm -rf only in commands without an ssh prefix.
_RM_COMMANDS = [
    "rm -rf /tmp/test ",
    "rm -f /important ",
    "kubectl exec deploy/api -- rm -rf /data ",
]

_SQL_DDL = [
    "kubectl exec -n prod deploy/pg -- psql -c 'DROP TABLE incidents;'",
    "kubectl exec -n prod deploy/pg -- psql -c 'drop database awoooi'",
]

_SHELL_INJECTIONS = [
    "kubectl get pods -n prod $(echo injected)",
    "kubectl rollout restart deployment/$(cat /etc/passwd)",
    "kubectl exec deploy/api -- `whoami`",
]


def _assert_passes(cmd):
    """Assert that the validator accepts ``cmd``."""
    assert validate_kubectl_command(cmd) is True


def _assert_blocked(cmd):
    """Assert that the validator rejects ``cmd``."""
    assert validate_kubectl_command(cmd) is False


# =============================================================================
# Accepted commands (validator returns True)
# =============================================================================


class TestValidKubectlCommands:
    """Legitimate commands must pass validation."""

    def test_empty_string(self):
        """An empty string passes (rules with NO_ACTION)."""
        _assert_passes("")

    def test_none_like_empty(self):
        """A whitespace-only string passes as well."""
        _assert_passes(" ")

    @pytest.mark.parametrize("cmd", _SSH_COMMANDS)
    def test_ssh_commands_pass(self, cmd):
        """ssh commands always pass (run on the host, not via kubectl)."""
        _assert_passes(cmd)

    @pytest.mark.parametrize("cmd", _SAFE_KUBECTL_COMMANDS)
    def test_safe_kubectl_commands_pass(self, cmd):
        """Common, legitimate kubectl commands pass."""
        _assert_passes(cmd)

    def test_kubectl_exec_with_psql(self):
        """kubectl exec running a SQL SELECT passes."""
        psql_query = (
            "kubectl exec -n awoooi-prod deployment/postgresql -- "
            "psql -U postgres -c 'SELECT pg_terminate_backend(pid) FROM pg_stat_activity;'"
        )
        _assert_passes(psql_query)

    def test_kubectl_get_with_jq(self):
        """kubectl get chained with curl via && passes."""
        chained = "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status"
        _assert_passes(chained)


# =============================================================================
# Rejected commands (validator returns False)
# =============================================================================


class TestDestructiveKubectlCommands:
    """Destructive patterns must be rejected."""

    @pytest.mark.parametrize("cmd", _DESTRUCTIVE_DELETES)
    def test_destructive_delete_blocked(self, cmd):
        """Destructive kubectl delete commands are rejected."""
        _assert_blocked(cmd)

    @pytest.mark.parametrize("cmd", _SCALE_TO_ZERO)
    def test_scale_to_zero_blocked(self, cmd):
        """Scaling to zero via the --replicas=0 flag form is rejected."""
        _assert_blocked(cmd)

    @pytest.mark.parametrize("cmd", _NODE_EVICTIONS)
    def test_node_eviction_blocked(self, cmd):
        """Node drain / cordon commands are rejected."""
        _assert_blocked(cmd)

    @pytest.mark.parametrize("cmd", _RM_COMMANDS)
    def test_rm_rf_blocked(self, cmd):
        """rm -rf / rm -f without an ssh prefix is rejected."""
        _assert_blocked(cmd)

    @pytest.mark.parametrize("cmd", _SQL_DDL)
    def test_sql_ddl_blocked(self, cmd):
        """Destructive SQL DDL is rejected."""
        _assert_blocked(cmd)

    @pytest.mark.parametrize("cmd", _SHELL_INJECTIONS)
    def test_shell_injection_blocked(self, cmd):
        """Shell command substitution is rejected."""
        _assert_blocked(cmd)

    def test_variable_substitution_injection(self):
        """A command produced by substituting target='; rm -rf /' is rejected."""
        # Simulates the command string after variable substitution carried an
        # injected payload; "rm -rf" followed by a space triggers the block.
        injected_cmd = "kubectl rollout restart deployment/awoooi-api; rm -rf / -n prod"
        _assert_blocked(injected_cmd)