From ed2a4838f22e20eca5545ee718ab894844e6f44a Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 30 Apr 2026 14:06:09 +0800 Subject: [PATCH] fix(auto): use action parser for repair gates --- apps/api/src/api/v1/webhooks.py | 12 +- apps/api/src/services/action_parser.py | 161 +++++++++++++++++- apps/api/src/services/alert_rule_engine.py | 44 ++--- apps/api/src/services/auto_approve.py | 37 +++- apps/api/tests/test_action_parser_safety.py | 6 + .../test_alert_rule_engine_validation.py | 12 +- apps/api/tests/test_cs1_auto_execute.py | 17 +- apps/api/tests/test_cs3_auto_execute.py | 10 +- apps/api/tests/test_destructive_patterns.py | 12 +- .../tests/test_rule_engine_auto_execute.py | 15 +- docs/LOGBOOK.md | 13 ++ 11 files changed, 279 insertions(+), 60 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 5b32e0bd..da34c680 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -34,6 +34,7 @@ from pydantic import BaseModel, Field from src.core.config import settings from src.core.constants import is_cicd_alertname, is_heartbeat_alertname from src.services.alert_rule_engine import get_incident_type, match_rule +from src.services.action_parser import is_safe_kubectl_action from src.core.logging import get_logger from src.core.metrics import record_alert_chain_success @@ -1059,15 +1060,13 @@ async def receive_alert( # 設計:confidence ≥ 0.85 + 非 CRITICAL + 非破壞性 + 有 kubectl 指令 → 直接執行 # 安全防線:CRITICAL / destructive patterns / NO_ACTION/INVESTIGATE/OBSERVE / 空 kubectl → 降級 PENDING if analysis_result: - from src.services.auto_approve import _DESTRUCTIVE_PATTERNS as _cs1_destr_patterns - _cs1_kubectl = analysis_result.kubectl_command.strip() if analysis_result.kubectl_command else "" _cs1_can_auto = ( bool(_cs1_kubectl) and analysis_result.confidence >= 0.85 and risk_level != RiskLevel.CRITICAL and _sa_val not in _non_destructive_actions - and not any(p in _cs1_kubectl.lower() for p in _cs1_destr_patterns) + and is_safe_kubectl_action(_cs1_kubectl) ) if _cs1_can_auto: try: @@ -1396,15 +1395,13 @@ async def _process_new_alert_background( # 設計:is_rule_based=True 確定性高,滿足條件直接執行,不等人工審核 # 安全防線:CRITICAL / destructive patterns / NO_ACTION / 空 kubectl → 全部降級 PENDING try: - from src.services.auto_approve import _DESTRUCTIVE_PATTERNS from src.models.approval import ApprovalRequest, ApprovalStatus from src.services.approval_execution import ApprovalExecutionService - _destructive_set = set(p.lower() for p in _DESTRUCTIVE_PATTERNS) _can_auto = ( bool(rule_kubectl) and rule_risk != RiskLevel.CRITICAL - and not any(p in rule_kubectl.lower() for p in _destructive_set) + and is_safe_kubectl_action(rule_kubectl) and "NO_ACTION" not in rule_action ) if _can_auto: @@ -1576,14 +1573,13 @@ async def _process_new_alert_background( logger.warning("shadow_auto_approve_failed", error=str(_shadow_err_cs3)) # 2026-04-27 Claude Sonnet 4.6: CS3 LLM 高信心自動執行(修法3擴展) - from src.services.auto_approve import _DESTRUCTIVE_PATTERNS as _cs3_destr_patterns # noqa: PLC0415 _cs3_kubectl = (analysis_result.kubectl_command or "").strip() _cs3_can_auto = ( bool(_cs3_kubectl) and analysis_result.confidence >= 0.85 and risk_level != RiskLevel.CRITICAL and "NO_ACTION" not in (analysis_result.action_title or "") - and not any(p in _cs3_kubectl.lower() for p in _cs3_destr_patterns) + and is_safe_kubectl_action(_cs3_kubectl) ) if _cs3_can_auto: try: diff --git a/apps/api/src/services/action_parser.py b/apps/api/src/services/action_parser.py index 4bb16fca..8d78a2dc 100644 --- a/apps/api/src/services/action_parser.py +++ b/apps/api/src/services/action_parser.py @@ -57,6 +57,8 @@ class ActionKind(StrEnum): READONLY = "readonly" ROLLOUT = "rollout" SCALE = "scale" + AUTOSCALE = "autoscale" + SET_RESOURCES = "set_resources" DELETE_POD = "delete_pod" @@ -81,13 +83,29 @@ def is_safe_kubectl_action(command: str) -> bool: return parse_kubectl_action(command).ok +def kubectl_safety_reason(command: str) -> str | None: + """Return None for a safe kubectl command, otherwise the parser reason. + + Non-kubectl commands are outside this parser's scope and return None so + SSH / host-repair gates can keep their own policy. + """ + + command = (command or "").strip() + if not command.lower().startswith("kubectl"): + return None + parsed = parse_kubectl_action(command) + return None if parsed.ok else parsed.reason + + def parse_kubectl_action(command: str) -> ParsedKubectlAction: """Parse and validate a kubectl command for auto-execute safety. The grammar is intentionally narrow: - readonly: get/describe/logs/top/version with bounded, known-safe flags - - rollout: rollout restart/undo on workload resources + - rollout: rollout restart on workload resources - scale: scale deployment/statefulset to a positive replica count + - autoscale: HPA bounds on deployment/statefulset with positive min/max + - set resources: CPU/memory requests/limits on deployment/statefulset - delete: delete one pod by name only """ @@ -124,6 +142,10 @@ def parse_kubectl_action(command: str) -> ParsedKubectlAction: return _parse_rollout(rest, namespace, namespace_flags) if verb == "scale": return _parse_scale(rest, namespace, namespace_flags) + if verb == "autoscale": + return _parse_autoscale(rest, namespace, namespace_flags) + if verb == "set": + return _parse_set(rest, namespace, namespace_flags) if verb == "delete": return _parse_delete(rest, namespace, namespace_flags) return _reject("unsupported_verb") @@ -241,7 +263,7 @@ def _parse_rollout( if len(tokens) < 2: return _reject("rollout_missing_args") subverb = tokens[0] - if subverb not in {"restart", "undo"}: + if subverb != "restart": return _reject("unsupported_rollout_subverb") resource_type, resource_name, rest = _split_resource_ref(tokens[1:]) @@ -308,6 +330,104 @@ def _parse_scale( ) +def _parse_autoscale( + tokens: list[str], + namespace: str | None, + namespace_flags: list[str], +) -> ParsedKubectlAction: + resource_type, resource_name, rest = _split_resource_ref(tokens) + if resource_type not in _SCALABLE_RESOURCES or not resource_name: + return _reject("invalid_autoscale_resource") + + min_replicas: int | None = None + max_replicas: int | None = None + cpu_percent: int | None = None + remaining_flags: list[str] = [] + i = 0 + while i < len(rest): + token = rest[i] + flag, raw_value, consumed = _consume_required_flag_value( + rest, + i, + {"--min", "--max", "--cpu-percent"}, + ) + if not flag or raw_value is None: + return _reject("unsupported_autoscale_flag") + value = _parse_positive_int(raw_value) + if value < 1: + return _reject("autoscale_value_must_be_positive") + if flag == "--min": + min_replicas = value + elif flag == "--max": + max_replicas = value + elif flag == "--cpu-percent": + cpu_percent = value + remaining_flags.extend(rest[i:i + consumed]) + i += consumed + + if min_replicas is None or max_replicas is None: + return _reject("autoscale_min_max_required") + if max_replicas < min_replicas: + return _reject("autoscale_max_below_min") + if cpu_percent is not None and cpu_percent > 100: + return _reject("autoscale_cpu_percent_out_of_range") + + return ParsedKubectlAction( + ok=True, + reason="ok", + kind=ActionKind.AUTOSCALE, + verb="autoscale", + resource_type=resource_type, + resource_name=resource_name, + namespace=namespace, + flags=tuple(namespace_flags + remaining_flags), + ) + + +def _parse_set( + tokens: list[str], + namespace: str | None, + namespace_flags: list[str], +) -> ParsedKubectlAction: + if not tokens or tokens[0] != "resources": + return _reject("unsupported_set_subverb") + resource_type, resource_name, rest = _split_resource_ref(tokens[1:]) + if resource_type not in _SCALABLE_RESOURCES or not resource_name: + return _reject("invalid_set_resources_target") + + saw_resource_flag = False + remaining_flags: list[str] = [] + i = 0 + while i < len(rest): + flag, raw_value, consumed = _consume_required_flag_value( + rest, + i, + {"--limits", "--requests"}, + ) + if not flag or raw_value is None: + return _reject("unsupported_set_resources_flag") + if not _resource_quantity_assignments_safe(raw_value): + return _reject("invalid_resource_quantity") + saw_resource_flag = True + remaining_flags.extend(rest[i:i + consumed]) + i += consumed + + if not saw_resource_flag: + return _reject("set_resources_requires_limits_or_requests") + + return ParsedKubectlAction( + ok=True, + reason="ok", + kind=ActionKind.SET_RESOURCES, + verb="set", + subverb="resources", + resource_type=resource_type, + resource_name=resource_name, + namespace=namespace, + flags=tuple(namespace_flags + remaining_flags), + ) + + def _parse_delete( tokens: list[str], namespace: str | None, @@ -339,6 +459,43 @@ def _parse_positive_int(value: str) -> int: return int(value) +def _consume_required_flag_value( + tokens: list[str], + index: int, + allowed_flags: set[str], +) -> tuple[str | None, str | None, int]: + token = tokens[index] + if "=" in token: + flag, value = token.split("=", 1) + if flag not in allowed_flags or not value: + return None, None, 1 + return flag, value, 1 + + if token not in allowed_flags or index + 1 >= len(tokens): + return None, None, 1 + value = tokens[index + 1] + if not value or value.startswith("-"): + return None, None, 1 + return token, value, 2 + + +def _resource_quantity_assignments_safe(value: str) -> bool: + parts = value.split(",") + if not parts: + return False + for part in parts: + key, separator, quantity = part.partition("=") + if separator != "=": + return False + if key not in {"cpu", "memory"}: + return False + if not quantity or not set(quantity) <= _SAFE_TOKEN_CHARS: + return False + if quantity in {"0", "0m", "0Mi", "0Gi"}: + return False + return True + + def _flags_allowed(tokens: list[str], allowed_flags: set[str]) -> bool: i = 0 while i < len(tokens): diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index 5e3235be..97c3ffb0 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -32,6 +32,7 @@ import structlog import yaml from src.constants.alert_types import ALERTNAME_TO_TYPE +from src.services.action_parser import parse_kubectl_action logger = structlog.get_logger(__name__) @@ -43,19 +44,17 @@ _generating: set[str] = set() # Redis 分散式鎖 TTL (秒),覆蓋 Ollama + Gemini 最長生成時間 _RULE_GEN_LOCK_TTL = 120 -# ── kubectl 注入防護 (Task 2.3, ADR-076, 2026-04-14) ───────── -# 對齊 auto_approve._DESTRUCTIVE_PATTERNS + decision_manager._ALLOWED_KUBECTL_PATTERN -# 目標: 規則 YAML 中的 kubectl_command 在變數替換後若含下列破壞性模式 → 清空並告警 -_RULE_ENGINE_DESTRUCTIVE_RE = re.compile( - r"(kubectl\s+delete\s+(pvc|namespace|statefulset|deployment)" # 破壞性 K8s 刪除 - r"|kubectl\s+(drain|cordon)" # 節點驅逐/封鎖 - r"|--replicas=\s*0\b" # 縮容至零 - r"|rm\s+-[rf]{1,2}\s" # rm -rf - r"|\bdrop\s+(table|database)\b" # SQL 破壞性 DDL - r"|\$\([^)]{0,200}\)" # shell 命令替換 $(...) - r"|`[^`]{0,200}`" # 反引號替換 - r")", - re.IGNORECASE, +# ── action parser 注入防護 (SPF-2, 2026-04-30) ─────────────── +# kubectl 走 structured token parser;非 kubectl 保留簡單 dangerous-fragment +# 掃描,避免舊式巨型 regex 誤殺安全的單一 delete pod / deployment resource forms。 +_RULE_ENGINE_DANGEROUS_FRAGMENTS = ( + "rm -rf", + "rm -f /", + "drop table", + "drop database", + "truncate table", + "$(", + "`", ) # ── kubectl 注入防護 公開 API ─────────────────────────────── @@ -63,7 +62,7 @@ _RULE_ENGINE_DESTRUCTIVE_RE = re.compile( def validate_kubectl_command(command: str) -> bool: """ - kubectl 注入安全驗證(Task 2.3, ADR-076)。 + Action 注入安全驗證(Task 2.3, ADR-076; SPF-2 parser upgrade)。 Returns: True — 指令安全,可執行 @@ -74,18 +73,19 @@ def validate_kubectl_command(command: str) -> bool: - "ssh ..." 開頭 — SSH 層指令,不走 kubectl 路徑 阻擋條件(返回 False): - - kubectl delete pvc/namespace/statefulset/deployment — 破壞性刪除 - - kubectl drain / cordon — 節點驅逐(業務衝擊) - - --replicas=0 — 縮容至零(服務停止) - - rm -rf — 主機層破壞 - - DROP TABLE/DATABASE — SQL 破壞性 DDL - - $(...) 或反引號 — Shell 命令注入 + - kubectl parser 不支援的語法(deployment delete / drain / cordon / + replicas=0 / shell metachar / compound command) + - 非 kubectl 指令內含主機/SQL/command-substitution 危險片段 """ + command = (command or "").strip() if not command: return True - if command.strip().startswith("ssh "): + if command.startswith("ssh "): return True - return not bool(_RULE_ENGINE_DESTRUCTIVE_RE.search(command)) + if command.startswith("kubectl"): + return parse_kubectl_action(command).ok + command_lower = command.lower() + return not any(fragment in command_lower for fragment in _RULE_ENGINE_DANGEROUS_FRAGMENTS) # ── 變數提取 ──────────────────────────────────────────────── diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py index 0a52f549..a529e479 100644 --- a/apps/api/src/services/auto_approve.py +++ b/apps/api/src/services/auto_approve.py @@ -26,6 +26,7 @@ from typing import Any import structlog from src.models.playbook import Playbook +from src.services.action_parser import parse_kubectl_action from src.services.playbook_rag import PlaybookMatch from src.services.trust_engine import TrustScoreManager, get_trust_manager @@ -105,7 +106,9 @@ _DESTRUCTIVE_PATTERNS: list[str] = [ "replicas=0", # 任何形式的 replicas=0 # --- K8s 刪除操作 --- - "delete pod", # 強制刪除 pod (kubectl delete pod / pods) + "delete pod --all", # 批次刪除 pod + "delete pod -A", # 跨 namespace 刪除 pod + "delete pod --all-namespaces", "delete pods", # 複數形式 "delete deployment", # 刪除 deployment "delete pvc", # 刪除 PVC (資料丟失) @@ -263,20 +266,36 @@ class AutoApprovePolicy: confidence=confidence, ) - # 條件 1b: 破壞性指令攔截 (ADR-070: 2026-04-11 Claude Sonnet 4.6) - # 即使是 low/medium risk,以下操作仍需人工確認 - # 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工 - # M1+C3 修復 2026-04-11 (Code Review): 移至模組常量 + 補全 K8s/Docker 高風險操作 - action_lower = action.lower() - for pattern in _DESTRUCTIVE_PATTERNS: - if pattern in action_lower: + # 條件 1b: structured action parser 安全閘 (SPF-2, 2026-04-30) + # kubectl 指令以 token grammar 判斷,避免 substring regex 誤殺 + # `kubectl delete pod `,同時攔截 delete deployment / + # delete --all / rollout undo / replicas=0 / shell injection。 + action_stripped = action.strip() + action_lower = action_stripped.lower() + kubectl_cmd_raw = str(proposal_data.get("kubectl_command", "") or "").strip() + kubectl_candidate = kubectl_cmd_raw + if not kubectl_candidate and "kubectl" in action_lower: + kubectl_candidate = action_stripped[action_lower.index("kubectl"):].strip() + if kubectl_candidate.lower().startswith("kubectl"): + parsed_action = parse_kubectl_action(kubectl_candidate) + if not parsed_action.ok: return self._reject( reason=AutoApproveReason.CRITICAL_OPERATION, - detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval", + detail=f"kubectl action parser rejected action: {parsed_action.reason} — requires human approval", risk_level=risk_level, trust_score=trust_score, confidence=confidence, ) + else: + for pattern in _DESTRUCTIVE_PATTERNS: + if pattern in action_lower: + return self._reject( + reason=AutoApproveReason.CRITICAL_OPERATION, + detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval", + risk_level=risk_level, + trust_score=trust_score, + confidence=confidence, + ) # 條件 1c: 無可執行指令 → 拒絕自動執行(2026-04-16 ogt + Claude Sonnet 4.6) # 根因:INVALID_TARGET 導致 rule engine 清空 kubectl_command,action 為空 diff --git a/apps/api/tests/test_action_parser_safety.py b/apps/api/tests/test_action_parser_safety.py index 66cb128a..40801b85 100644 --- a/apps/api/tests/test_action_parser_safety.py +++ b/apps/api/tests/test_action_parser_safety.py @@ -18,6 +18,8 @@ from src.services.action_parser import ( "kubectl rollout restart deployment awoooi-api -n awoooi-prod", "kubectl -n awoooi-prod rollout restart deploy/awoooi-api", "kubectl scale deployment awoooi-api --replicas=3 -n awoooi-prod", + "kubectl autoscale deployment awoooi-api --cpu-percent=70 --min=2 --max=5 -n awoooi-prod", + "kubectl set resources deployment/awoooi-api --limits=cpu=2000m,memory=1Gi -n awoooi-prod", "kubectl delete pod awoooi-api-7d6b776f78-4sgjl -n awoooi-prod", "kubectl get pods -n awoooi-prod", "kubectl describe node k3s-node-01", @@ -33,10 +35,14 @@ def test_safe_kubectl_actions_pass(cmd): "kubectl get pods -n prod $(echo injected)", "kubectl rollout restart deployment/$(cat /etc/passwd)", "kubectl rollout restart deployment/awoooi-api; rm -rf / -n prod", + "kubectl rollout undo deployment/awoooi-api -n prod", "kubectl get pods -n prod && curl http://attacker.invalid", "kubectl delete deployment awoooi-api -n awoooi-prod", "kubectl delete pods --all -n awoooi-prod", + "kubectl delete pod awoooi-api-7d6b776f78-4sgjl --force -n awoooi-prod", "kubectl scale deployment awoooi-api --replicas=0 -n awoooi-prod", + "kubectl autoscale deployment awoooi-api --min=5 --max=2 -n awoooi-prod", + "kubectl set resources deployment/awoooi-api --limits=ephemeral-storage=10Gi -n awoooi-prod", "kubectl patch deployment awoooi-api -p spec -n awoooi-prod", "ssh 192.168.0.188 docker restart openclaw", ]) diff --git a/apps/api/tests/test_alert_rule_engine_validation.py b/apps/api/tests/test_alert_rule_engine_validation.py index 5f7ccbf9..a05d523d 100644 --- a/apps/api/tests/test_alert_rule_engine_validation.py +++ b/apps/api/tests/test_alert_rule_engine_validation.py @@ -62,18 +62,18 @@ class TestValidKubectlCommands: """常見合法 kubectl 指令應通過""" assert validate_kubectl_command(cmd) is True - def test_kubectl_exec_with_psql(self): - """kubectl exec 查詢(含 SQL SELECT)→ 通過""" + def test_kubectl_exec_with_psql_is_not_auto_executable(self): + """kubectl exec 可執行任意 shell,必須降級人工""" cmd = ( "kubectl exec -n awoooi-prod deployment/postgresql -- " "psql -U postgres -c 'SELECT pg_terminate_backend(pid) FROM pg_stat_activity;'" ) - assert validate_kubectl_command(cmd) is True + assert validate_kubectl_command(cmd) is False - def test_kubectl_get_with_jq(self): - """kubectl get + pipe → 通過""" + def test_compound_kubectl_get_is_not_auto_executable(self): + """compound shell 指令必須降級人工""" cmd = "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status" - assert validate_kubectl_command(cmd) is True + assert validate_kubectl_command(cmd) is False # ============================================================================= diff --git a/apps/api/tests/test_cs1_auto_execute.py b/apps/api/tests/test_cs1_auto_execute.py index 734fa558..25832df3 100644 --- a/apps/api/tests/test_cs1_auto_execute.py +++ b/apps/api/tests/test_cs1_auto_execute.py @@ -64,7 +64,7 @@ def _run_cs1_block( 回傳 (mock_executor_class, mock_execute_method) """ - from src.services.auto_approve import _DESTRUCTIVE_PATTERNS + from src.services.action_parser import is_safe_kubectl_action mock_exec_instance = MagicMock() if exec_side_effect is not None: @@ -94,7 +94,7 @@ def _run_cs1_block( and analysis_result.confidence >= 0.85 and risk_level != RiskLevel.CRITICAL and _sa_val not in _non_destructive_actions - and not any(p in _cs1_kubectl.lower() for p in _DESTRUCTIVE_PATTERNS) + and is_safe_kubectl_action(_cs1_kubectl) ) if _cs1_can_auto: import asyncio @@ -155,6 +155,15 @@ class TestCS1AutoExecuteConditions: _, mock_exec = _run_cs1_block(analysis, RiskLevel.LOW) mock_exec.execute_approved_action.assert_not_called() + def test_single_delete_pod_executes(self): + """單一 Pod delete 是可恢復操作,parser 不應誤殺""" + analysis = _make_analysis( + confidence=0.90, + kubectl_command="kubectl delete pod api-xxx-yyy -n prod", + ) + _, mock_exec = _run_cs1_block(analysis, RiskLevel.LOW) + mock_exec.execute_approved_action.assert_called_once() + def test_no_action_does_not_execute(self): """suggested_action=NO_ACTION → 不執行""" analysis = _make_analysis( @@ -182,7 +191,7 @@ class TestCS1AutoExecuteFailureDegradation: analysis = _make_analysis(confidence=0.90) # 直接測試條件邏輯,確保例外被吞掉 - from src.services.auto_approve import _DESTRUCTIVE_PATTERNS + from src.services.action_parser import is_safe_kubectl_action _non_destructive_actions = {"NO_ACTION", "INVESTIGATE", "OBSERVE"} _sa_val = analysis.suggested_action.value @@ -192,7 +201,7 @@ class TestCS1AutoExecuteFailureDegradation: and analysis.confidence >= 0.85 and RiskLevel.LOW != RiskLevel.CRITICAL and _sa_val not in _non_destructive_actions - and not any(p in _cs1_kubectl.lower() for p in _DESTRUCTIVE_PATTERNS) + and is_safe_kubectl_action(_cs1_kubectl) ) assert _cs1_can_auto, "前置條件必須為 True 才能測試降級" diff --git a/apps/api/tests/test_cs3_auto_execute.py b/apps/api/tests/test_cs3_auto_execute.py index 9dd9d9b0..dd574405 100644 --- a/apps/api/tests/test_cs3_auto_execute.py +++ b/apps/api/tests/test_cs3_auto_execute.py @@ -35,13 +35,14 @@ def _make_analysis( def _can_auto(analysis, risk_level, patterns): from src.models.approval import RiskLevel + from src.services.action_parser import is_safe_kubectl_action kubectl = (analysis.kubectl_command or "").strip() return ( bool(kubectl) and analysis.confidence >= 0.85 and risk_level != RiskLevel.CRITICAL and "NO_ACTION" not in (analysis.action_title or "") - and not any(p in kubectl.lower() for p in patterns) + and is_safe_kubectl_action(kubectl) ) @@ -78,9 +79,14 @@ class TestCS3AutoExecute: a = _make_analysis(action_title="NO_ACTION: no fix needed") assert _can_auto(a, RiskLevel.MEDIUM, patterns) is False - def test_destructive_delete_blocked(self, patterns): + def test_single_delete_pod_eligible(self, patterns): from src.models.approval import RiskLevel a = _make_analysis(kubectl="kubectl delete pod foo-123") + assert _can_auto(a, RiskLevel.MEDIUM, patterns) is True + + def test_delete_pods_all_blocked(self, patterns): + from src.models.approval import RiskLevel + a = _make_analysis(kubectl="kubectl delete pods --all -n prod") assert _can_auto(a, RiskLevel.MEDIUM, patterns) is False def test_destructive_force_check(self, patterns): diff --git a/apps/api/tests/test_destructive_patterns.py b/apps/api/tests/test_destructive_patterns.py index ba9d1c82..b84e277f 100644 --- a/apps/api/tests/test_destructive_patterns.py +++ b/apps/api/tests/test_destructive_patterns.py @@ -53,16 +53,22 @@ class TestDestructivePatternsBlocked: def test_scale_to_zero_blocked(self, policy): d = policy.evaluate(self._proposal("kubectl scale deployment api --replicas=0")) assert not d.should_auto_approve - assert "Destructive pattern" in d.reason_detail + assert "parser rejected" in d.reason_detail def test_delete_deployment_blocked(self, policy): d = policy.evaluate(self._proposal("kubectl delete deployment api-server")) assert not d.should_auto_approve - assert "Destructive pattern" in d.reason_detail + assert "parser rejected" in d.reason_detail - def test_delete_pod_blocked(self, policy): + def test_delete_pod_allowed_by_parser(self, policy): d = policy.evaluate(self._proposal("kubectl delete pod api-server-abc123")) + assert d.should_auto_approve + assert "Destructive pattern" not in d.reason_detail + + def test_delete_pod_force_blocked(self, policy): + d = policy.evaluate(self._proposal("kubectl delete pod api-server-abc123 --force")) assert not d.should_auto_approve + assert "parser rejected" in d.reason_detail def test_delete_pods_plural_blocked(self, policy): d = policy.evaluate(self._proposal("kubectl delete pods --all -n awoooi-prod")) diff --git a/apps/api/tests/test_rule_engine_auto_execute.py b/apps/api/tests/test_rule_engine_auto_execute.py index 44d989df..62bbe8b4 100644 --- a/apps/api/tests/test_rule_engine_auto_execute.py +++ b/apps/api/tests/test_rule_engine_auto_execute.py @@ -19,6 +19,7 @@ CS2 規則引擎自動執行條件邏輯測試 """ from src.models.approval import RiskLevel +from src.services.action_parser import is_safe_kubectl_action from src.services.auto_approve import _DESTRUCTIVE_PATTERNS @@ -31,11 +32,10 @@ def _evaluate_can_auto( 複製 webhooks.py CS2 路徑的 _can_auto 邏輯,用於單元測試。 任何修改 webhooks.py 邏輯的人,必須同步更新此函數。 """ - _destructive_set = set(p.lower() for p in _DESTRUCTIVE_PATTERNS) return ( bool(rule_kubectl) and rule_risk != RiskLevel.CRITICAL - and not any(p in rule_kubectl.lower() for p in _destructive_set) + and is_safe_kubectl_action(rule_kubectl) and "NO_ACTION" not in rule_action ) @@ -90,12 +90,12 @@ class TestCS2CanAutoConditions: # ── 防線 3:DESTRUCTIVE_PATTERNS ──────────────────────────────────── - def test_delete_pod_returns_false(self): + def test_single_delete_pod_returns_true(self): assert _evaluate_can_auto( rule_kubectl="kubectl delete pod api-xxx-yyy -n prod", rule_risk=RiskLevel.LOW, rule_action="刪除 Pod | kubectl delete pod api-xxx-yyy -n prod", - ) is False + ) is True def test_delete_pods_returns_false(self): assert _evaluate_can_auto( @@ -104,6 +104,13 @@ class TestCS2CanAutoConditions: rule_action="刪除所有 Pod | kubectl delete pods --all -n prod", ) is False + def test_delete_pod_force_returns_false(self): + assert _evaluate_can_auto( + rule_kubectl="kubectl delete pod api-xxx-yyy --force -n prod", + rule_risk=RiskLevel.LOW, + rule_action="強制刪除 Pod | kubectl delete pod api-xxx-yyy --force -n prod", + ) is False + def test_scale_to_zero_returns_false(self): assert _evaluate_can_auto( rule_kubectl="kubectl scale deployment/api --replicas=0 -n prod", diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d70c5628..c2a9550a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,19 @@ --- +## 2026-04-30 | SPF-2 action parser 收斂 — 告警自動修復安全閘 + +承接 Wave A「告警→自動修復」阻塞點,將 CS1/CS2/CS3 自動執行路徑從 substring destructive patterns 收斂到 structured kubectl action parser。 + +### 完成 +- `action_parser.py` 擴充安全語法:rollout restart、scale 正整數、autoscale 正 min/max、set resources CPU/memory、單一 Pod delete、read-only get/describe/logs/top/version。 +- `webhooks.py` CS1 / CS2 / CS3 全部改用 `is_safe_kubectl_action()`,避免 `_DESTRUCTIVE_PATTERNS` 誤殺 `kubectl delete pod `。 +- `auto_approve.py` kubectl action 先走 parser,非 kubectl / SSH 再走 legacy dangerous fragments;`delete pod --all`、`delete deployment`、`rollout undo`、`replicas=0`、shell injection 仍阻擋。 +- `alert_rule_engine.validate_kubectl_command()` 由巨型 regex 改為 parser-backed gate,compound shell / `kubectl exec` 自動降級人工。 + +### 驗證 +- `PYTHONPATH=apps/api python3 -m pytest apps/api/tests/test_action_parser_safety.py apps/api/tests/test_alert_rule_engine_validation.py apps/api/tests/test_rule_engine_auto_execute.py apps/api/tests/test_cs3_auto_execute.py apps/api/tests/test_cs1_auto_execute.py apps/api/tests/test_destructive_patterns.py -q` → 123 passed。 + ## 2026-04-30 | CD Runner 拆段 — host build/deploy 承接 `RWLayer ... unexpectedly nil` 持續打斷 Gitea CD 的問題。第一層 `capacity: 1` + Docker lock 可阻止跨 repo 並行,但長時間 Web build 仍會讓 transient act job container 在 build 收尾消失。