diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 54b80cfc..e4948df8 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1121,7 +1121,7 @@ async def alertmanager_webhook( or alert.labels.get("name") # Cadvisor/cAdvisor 容器名 or alert.labels.get("container") # K8s container name or alert.labels.get("job") # Prometheus job name(次優) - or (_instance_clean if _instance_clean and not _instance_clean.startswith("192.") else None) + or (_instance_clean if _instance_clean and not is_internal_ip(_instance_clean) else None) or alertname ) namespace = alert.labels.get("namespace", "default") diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py index 51ca10fe..35f9e9e6 100644 --- a/apps/api/src/services/auto_approve.py +++ b/apps/api/src/services/auto_approve.py @@ -90,6 +90,45 @@ class AutoApproveConfig: DEFAULT_CONFIG = AutoApproveConfig() +# ============================================================================= +# 破壞性指令攔截清單 (ADR-070, 2026-04-11 Claude Sonnet 4.6) +# C3+M1 修復 (Code Review 2026-04-11): 移至模組常量 + 補全 K8s/Docker 高風險操作 +# 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工確認 +# ============================================================================= + +_DESTRUCTIVE_PATTERNS: list[str] = [ + # --- Scale to zero (停機) --- + "--replicas=0", # kubectl scale --replicas=0 + '"replicas": 0', # kubectl patch JSON patch + "'replicas': 0", # kubectl patch 單引號變體 + "replicas=0", # 任何形式的 replicas=0 + + # --- K8s 刪除操作 --- + "delete pod", # 強制刪除 pod (kubectl delete pod / pods) + "delete pods", # 複數形式 + "delete deployment", # 刪除 deployment + "delete pvc", # 刪除 PVC (資料丟失) + "delete namespace", # 刪除 namespace + "kubectl drain", # 驅逐節點所有 pod + "kubectl cordon", # 封鎖節點(業務影響) + "kubectl rollout undo", # 回滾部署(需人工確認版本) + + # --- Docker 破壞性操作 --- + "docker rm", # 刪除容器 + "docker stop", # 停止容器(不同於 restart) + "docker kill", # 強制殺死容器 + + # --- DB DDL (不可逆) --- + "drop table", + "drop database", + "truncate table", + + # --- SSH 危險指令 --- + "rm -rf", # 遞迴刪除 + "rm -f /", # 刪除根目錄 +] + + # ============================================================================= # Data Models # ============================================================================= @@ -226,17 +265,7 @@ class AutoApprovePolicy: # 條件 1b: 破壞性指令攔截 (ADR-070: 2026-04-11 Claude Sonnet 4.6) # 即使是 low/medium risk,以下操作仍需人工確認 # 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工 - _DESTRUCTIVE_PATTERNS = [ - "--replicas=0", # scale to zero — 等同停機 - "scale deployment", # 任何 scale 操作需確認目標副本數 - "delete pod", # 強制刪除 pod - "delete deployment", # 刪除 deployment (不是 restart) - "delete pvc", # 刪除 PVC (資料丟失) - "delete namespace", # 刪除 namespace - "drop table", # DB DDL - "drop database", # DB DDL - "truncate table", # DB DDL - ] + # M1+C3 修復 2026-04-11 (Code Review): 移至模組常量 + 補全 K8s/Docker 高風險操作 action_lower = action.lower() for pattern in _DESTRUCTIVE_PATTERNS: if pattern in action_lower: diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index e0cf1948..4fb84544 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1401,34 +1401,55 @@ class DecisionManager: labels = incident.signals[0].labels alertname = labels.get("alertname", "") host = labels.get("instance", "").split(":")[0] or labels.get("host", "") - container = labels.get("name") or labels.get("container") or incident.affected_services[0] if incident.affected_services else "" + # C1 修復 2026-04-11 (Code Review): 修正 Python ternary 優先度問題 + # 原: `A or B or C[0] if list else ""` → ternary 控制全式,非僅 C[0] + container = ( + labels.get("name") + or labels.get("container") + or (incident.affected_services[0] if incident.affected_services else "") + ) ns = labels.get("namespace", "awoooi-prod") ctx_parts: list[str] = [] + # C2 修復 2026-04-11 (Code Review): 所有 MCP 呼叫加 5s timeout,防止阻塞決策主路徑 + _MCP_TIMEOUT = 5.0 + # 主機/Docker 告警 → SSH MCP 診斷 _HOST_ALERT_PREFIXES = ("Host", "Docker", "Sentry", "Harbor", "Ollama", "Backup") if alertname.startswith(_HOST_ALERT_PREFIXES) and host: try: from src.plugins.mcp.providers.ssh_provider import SSHProvider ssh = SSHProvider() - if ssh.enabled and host in ("192.168.0.188", "192.168.0.110"): + # C4: 未知主機記錄 warning(不靜默跳過) + _KNOWN_HOSTS = ("192.168.0.188", "192.168.0.110") + if ssh.enabled and host not in _KNOWN_HOSTS: + logger.warning("mcp_context_unknown_host", host=host, known=_KNOWN_HOSTS) + if ssh.enabled and host in _KNOWN_HOSTS: # 查容器狀態 if container and container != alertname: - status_result = await ssh.execute( - tool_name="ssh_get_container_status", - params={"host": host, "container_name": container}, + status_result = await asyncio.wait_for( + ssh.execute( + tool_name="ssh_get_container_status", + params={"host": host, "container_name": container}, + ), + timeout=_MCP_TIMEOUT, ) if status_result.get("success"): ctx_parts.append(f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}") # 查主機資源 if "CpuLoad" in alertname or "Memory" in alertname: - top_result = await ssh.execute( - tool_name="ssh_get_top_processes", - params={"host": host, "top_n": 5}, + top_result = await asyncio.wait_for( + ssh.execute( + tool_name="ssh_get_top_processes", + params={"host": host, "top_n": 5}, + ), + timeout=_MCP_TIMEOUT, ) if top_result.get("success"): ctx_parts.append(f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}") + except asyncio.TimeoutError: + logger.warning("mcp_context_ssh_timeout", alertname=alertname, host=host, timeout=_MCP_TIMEOUT) except Exception as e: logger.debug("mcp_context_ssh_failed", alertname=alertname, error=str(e)) @@ -1440,12 +1461,17 @@ class DecisionManager: if k8s.enabled: pod = labels.get("pod", "") if pod: - events_result = await k8s.execute( - tool_name="k8s_get_events", - params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"}, + events_result = await asyncio.wait_for( + k8s.execute( + tool_name="k8s_get_events", + params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"}, + ), + timeout=_MCP_TIMEOUT, ) if events_result.get("success"): ctx_parts.append(f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}") + except asyncio.TimeoutError: + logger.warning("mcp_context_k8s_timeout", alertname=alertname, timeout=_MCP_TIMEOUT) except Exception as e: logger.debug("mcp_context_k8s_failed", alertname=alertname, error=str(e)) diff --git a/apps/api/tests/test_destructive_patterns.py b/apps/api/tests/test_destructive_patterns.py new file mode 100644 index 00000000..ba9d1c82 --- /dev/null +++ b/apps/api/tests/test_destructive_patterns.py @@ -0,0 +1,147 @@ +""" +auto_approve.py DESTRUCTIVE_PATTERNS 攔截清單測試 +================================================== +測試策略 (feedback_no_mock_testing.md): +- 直接呼叫 AutoApprovePolicy.evaluate(),不 Mock +- 驗證破壞性操作被攔截 (should_auto_approve=False, CRITICAL_OPERATION) +- 驗證可恢復操作允許通過(不被 pattern 誤攔) +- 2026-04-11 Claude Sonnet 4.6: Code Review I4 修補 +""" + +import pytest + +from src.services.auto_approve import AutoApprovePolicy, _DESTRUCTIVE_PATTERNS + + +class TestDestructivePatternsConstant: + """確認模組常量存在且包含關鍵項目""" + + def test_constant_exists_and_is_list(self): + assert isinstance(_DESTRUCTIVE_PATTERNS, list) + assert len(_DESTRUCTIVE_PATTERNS) > 0 + + def test_contains_scale_to_zero(self): + assert "--replicas=0" in _DESTRUCTIVE_PATTERNS + + def test_contains_delete_deployment(self): + assert "delete deployment" in _DESTRUCTIVE_PATTERNS + + def test_contains_kubectl_drain(self): + assert "kubectl drain" in _DESTRUCTIVE_PATTERNS + + def test_contains_docker_rm(self): + assert "docker rm" in _DESTRUCTIVE_PATTERNS + + def test_contains_json_patch_replicas_zero(self): + # kubectl patch JSON 形式 + assert '"replicas": 0' in _DESTRUCTIVE_PATTERNS + + def test_contains_rm_rf(self): + assert "rm -rf" in _DESTRUCTIVE_PATTERNS + + +class TestDestructivePatternsBlocked: + """破壞性操作必須被攔截(無論 risk_level)""" + + @pytest.fixture + def policy(self): + return AutoApprovePolicy() + + def _proposal(self, action: str, risk_level: str = "medium", confidence: float = 0.9) -> dict: + return {"action": action, "risk_level": risk_level, "confidence": confidence} + + def test_scale_to_zero_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl scale deployment api --replicas=0")) + assert not d.should_auto_approve + assert "Destructive pattern" in d.reason_detail + + def test_delete_deployment_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl delete deployment api-server")) + assert not d.should_auto_approve + assert "Destructive pattern" in d.reason_detail + + def test_delete_pod_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl delete pod api-server-abc123")) + assert not d.should_auto_approve + + def test_delete_pods_plural_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl delete pods --all -n awoooi-prod")) + assert not d.should_auto_approve + + def test_kubectl_drain_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl drain 192.168.0.110 --ignore-daemonsets")) + assert not d.should_auto_approve + + def test_kubectl_rollout_undo_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl rollout undo deployment/api-server")) + assert not d.should_auto_approve + + def test_docker_rm_blocked(self, policy): + d = policy.evaluate(self._proposal("docker rm -f sentry")) + assert not d.should_auto_approve + + def test_docker_stop_blocked(self, policy): + d = policy.evaluate(self._proposal("docker stop harbor")) + assert not d.should_auto_approve + + def test_drop_table_blocked(self, policy): + d = policy.evaluate(self._proposal("drop table incidents")) + assert not d.should_auto_approve + + def test_rm_rf_blocked(self, policy): + d = policy.evaluate(self._proposal("ssh 192.168.0.188 'rm -rf /data'")) + assert not d.should_auto_approve + + def test_json_patch_replicas_zero_blocked(self, policy): + d = policy.evaluate(self._proposal( + 'kubectl patch deployment api -p \'{"spec":{"replicas": 0}}\'' + )) + assert not d.should_auto_approve + + def test_destructive_blocked_even_low_risk(self, policy): + """破壞性操作即使 low risk 也必須攔截""" + d = policy.evaluate(self._proposal("kubectl delete deployment api-server", risk_level="low")) + assert not d.should_auto_approve + + def test_destructive_blocked_even_high_confidence(self, policy): + """高信心度也不能繞過破壞性攔截""" + d = policy.evaluate(self._proposal("kubectl drain node-1", confidence=0.99)) + assert not d.should_auto_approve + + +class TestSafeOperationsAllowed: + """可恢復操作不應被誤攔""" + + @pytest.fixture + def policy(self): + return AutoApprovePolicy() + + def _proposal(self, action: str, risk_level: str = "medium", confidence: float = 0.9) -> dict: + return {"action": action, "risk_level": risk_level, "confidence": confidence} + + def test_rollout_restart_allowed(self, policy): + """kubectl rollout restart 是可恢復操作,不應被攔截""" + d = policy.evaluate(self._proposal("kubectl rollout restart deployment/api-server")) + # 不應因 destructive pattern 被攔截 + assert "Destructive pattern" not in d.reason_detail + + def test_docker_restart_allowed(self, policy): + """docker restart 是可恢復操作""" + d = policy.evaluate(self._proposal("docker restart sentry")) + assert "Destructive pattern" not in d.reason_detail + + def test_scale_up_allowed(self, policy): + """scale to 2 replicas 不應被攔截""" + d = policy.evaluate(self._proposal("kubectl scale deployment api --replicas=2")) + assert "Destructive pattern" not in d.reason_detail + + def test_kubectl_get_allowed(self, policy): + """kubectl get 是只讀操作""" + d = policy.evaluate(self._proposal("kubectl get pods -n awoooi-prod")) + assert "Destructive pattern" not in d.reason_detail + + def test_critical_severity_always_blocked(self, policy): + """critical risk level 無論操作都需人工""" + d = policy.evaluate(self._proposal("kubectl rollout restart deployment/api", risk_level="critical")) + assert not d.should_auto_approve + assert d.reason.value == "critical_operation"