fix(happy-path): Happy Path 全境加固 — INVALID_TARGET + critical NO_ACTION + 空指令攔截

問題 1 (P0) — deployment/unknown 無效重啟：
- alert_rule_engine: 追蹤 _invalid_target flag，回傳 blocked_reason="INVALID_TARGET - ..."
- decision_manager: auto_execute 路徑偵測 INVALID_TARGET → 提早返回 + TYPE-4 人工確認
- auto_approve: 新增條件 1c — action 為空字串直接拒絕，防止誤報「即將執行」

問題 2 (P1) — critical+NO_ACTION 靜默：
- decision_manager: blocked_reason 感知層重構
  ① INVALID_TARGET → TYPE-4
  ② NO_ACTION + critical → TYPE-4（升級，SRE 不可錯過）
  ③ NO_ACTION + 非 critical → TYPE-1（維持純資訊卡）

問題 3 (P1) — 規則匹配信心黑洞：
- auto_approve 條件 1c 確保空 action 不通過 auto-approve
  即便 is_rule_based=True 也無法在無指令時自動執行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 22:57:50 +08:00
parent 0077ff9758
commit 83ab5e32d7
3 changed files with 56 additions and 7 deletions
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -372,6 +372,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
    # GAP-A4 (2026-04-14 Claude Sonnet 4.6): 後置驗證 — 垃圾 target 丟棄 command
    # 避免 `kubectl rollout restart deployment unknown/HostHighCpuLoad/...` 這類無效指令
    # 清空 kubectl_command 讓 decision_manager 降級給 LLM 處理
+    _invalid_target = False
    if kubectl_command and _is_bad_target(vars["target"], alertname):
        logger.warning(
            "rule_kubectl_command_discarded_bad_target",
@@ -382,6 +383,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
            original_command=kubectl_command[:120],
        )
        kubectl_command = ""
+        _invalid_target = True

    # 還有 {var} 殘留 → 模板變數未被 _fill 填滿（可能 vars 缺少對應 key）
    if kubectl_command and ("{" in kubectl_command or "}" in kubectl_command):
@@ -391,6 +393,15 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
            command=kubectl_command[:120],
        )
        kubectl_command = ""
+        _invalid_target = True
+
+    # 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 讓 decision_manager 感知 target 失敗
+    # 根因：rule engine 清空 kubectl_command 但不告訴下游原因 → decision_manager 繼續嘗試執行
+    # 修復：設定 blocked_reason="INVALID_TARGET - ..." 讓 auto_execute 路徑提早返回 + TYPE-4
+    _blocked_reason_out = (
+        f"INVALID_TARGET - target='{vars['target']}' 未解析為有效 Deployment，需 SRE 人工確認"
+        if _invalid_target else ""
+    )

    return {
        "rule_id": matched_rule["id"],
@@ -416,6 +427,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
        "confidence": 0.0,  # 🔴 規則匹配固定 0.0，禁止偽造
        "affected_services": [vars["target"]],
        "signoz_correlation": "",
+        "blocked_reason": _blocked_reason_out,  # "" 正常 | "INVALID_TARGET - ..." 目標解析失敗
    }


--- a/apps/api/src/services/auto_approve.py
+++ b/apps/api/src/services/auto_approve.py
@@ -277,6 +277,19 @@ class AutoApprovePolicy:
                    confidence=confidence,
                )

+        # 條件 1c: 無可執行指令 → 拒絕自動執行（2026-04-16 ogt + Claude Sonnet 4.6）
+        # 根因：INVALID_TARGET 導致 rule engine 清空 kubectl_command，action 為空
+        #       原本繼續走 auto_approve 流程，系統誤報「即將執行」但實際無指令
+        # 修復：action 為空字串時直接拒絕，強制 SRE 人工確認
+        if not action.strip():
+            return self._reject(
+                reason=AutoApproveReason.NO_PLAYBOOK,
+                detail="No executable action/kubectl_command — INVALID_TARGET or NO_ACTION, requires human review",
+                risk_level=risk_level,
+                trust_score=trust_score,
+                confidence=confidence,
+            )
+
        # 條件 2: 風險等級必須在允許列表中
        if risk_level not in self.config.allowed_risk_levels:
            return self._reject(
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -263,14 +263,19 @@ async def _push_decision_to_telegram(
        _notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
        _alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert"

-        # 2026-04-16 ogt + Claude Sonnet 4.6: YAML NO_ACTION → TYPE-1（資訊通知，無按鈕）
-        # 根因：YAML 規則標記 NO_ACTION（如 host_resource/postgresql_disk_monitoring）
-        #       但 classify_notification() 不知道 NO_ACTION → 判為 TYPE-4（confidence 不足）
-        #       → SRE 看到審核按鈕卻沒有任何可執行動作，完全無意義
-        # 修復：偵測 blocked_reason 含 "NO_ACTION" → 強制 TYPE-1（純資訊，告知 SRE 即可）
+        # 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層
+        # ① INVALID_TARGET → TYPE-4：系統無法識別告警目標，SRE 需人工調查
+        # ② NO_ACTION + critical → TYPE-4：critical 事件不能靜默，即便 AI 無法自動修復
+        # ③ NO_ACTION + non-critical → TYPE-1：一般資訊告警，純資訊卡即可
        _blocked_reason = proposal_data.get("blocked_reason", "")
-        if "NO_ACTION" in _blocked_reason:
-            _notif_type = NotificationType.TYPE_1
+        if "INVALID_TARGET" in _blocked_reason:
+            _notif_type = NotificationType.TYPE_4
+        elif "NO_ACTION" in _blocked_reason:
+            if risk_level == "critical":
+                # critical + NO_ACTION: SRE 必須知道（TYPE-4 AI 無法判斷），不可靜默為 TYPE-1
+                _notif_type = NotificationType.TYPE_4
+            else:
+                _notif_type = NotificationType.TYPE_1

        # 2026-04-12 ogt: classify_alert_early() 設的 notification_type 優先於 classify_notification()
        # 場景：backup/info 告警被 classify_notification() 誤判為 TYPE-3（confidence=0, 無 auto_executed）
@@ -1437,6 +1442,25 @@ class DecisionManager:
                        await self._save_token(token)
                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
                        return
+
+                    # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
+                    # 根因：target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
+                    # 系統不應繼續嘗試執行，提早返回讓 SRE 介入
+                    _yaml_blocked = _yaml_r.get("blocked_reason", "")
+                    if "INVALID_TARGET" in _yaml_blocked:
+                        logger.warning(
+                            "auto_execute_yaml_invalid_target",
+                            incident_id=incident.incident_id,
+                            alertname=_alertname_for_yaml,
+                            blocked_reason=_yaml_blocked,
+                        )
+                        token.state = DecisionState.READY
+                        token.proposal_data["auto_executed"] = False
+                        token.proposal_data["blocked_reason"] = _yaml_blocked
+                        await self._save_token(token)
+                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
+                        return
+
                    _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
                    if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
                        # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action