From 83ab5e32d729311e3ff02602f4b425b31454fa35 Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 16 Apr 2026 22:57:50 +0800 Subject: [PATCH] =?UTF-8?q?fix(happy-path):=20Happy=20Path=20=E5=85=A8?= =?UTF-8?q?=E5=A2=83=E5=8A=A0=E5=9B=BA=20=E2=80=94=20INVALID=5FTARGET=20+?= =?UTF-8?q?=20critical=20NO=5FACTION=20+=20=E7=A9=BA=E6=8C=87=E4=BB=A4?= =?UTF-8?q?=E6=94=94=E6=88=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題 1 (P0) — deployment/unknown 無效重啟: - alert_rule_engine: 追蹤 _invalid_target flag,回傳 blocked_reason="INVALID_TARGET-..." - decision_manager: auto_execute 路徑偵測 INVALID_TARGET → 提早返回 + TYPE-4 人工確認 - auto_approve: 新增條件 1c — action 為空字串直接拒絕,防止誤報「即將執行」 問題 2 (P1) — critical+NO_ACTION 靜默: - decision_manager: blocked_reason 感知層重構 ① INVALID_TARGET → TYPE-4 ② NO_ACTION + critical → TYPE-4(升級,SRE 不可錯過) ③ NO_ACTION + 非 critical → TYPE-1(維持純資訊卡) 問題 3 (P1) — 規則匹配信心黑洞: - auto_approve 條件 1c 確保空 action 不通過 auto-approve 即便 is_rule_based=True 也無法在無指令時自動執行 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/alert_rule_engine.py | 12 +++++++ apps/api/src/services/auto_approve.py | 13 ++++++++ apps/api/src/services/decision_manager.py | 38 ++++++++++++++++++---- 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index 38a55780..49f5d692 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -372,6 +372,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None: # GAP-A4 (2026-04-14 Claude Sonnet 4.6): 後置驗證 — 垃圾 target 丟棄 command # 避免 `kubectl rollout restart deployment unknown/HostHighCpuLoad/...` 這類無效指令 # 清空 kubectl_command 讓 decision_manager 降級給 LLM 處理 + _invalid_target = False if kubectl_command and _is_bad_target(vars["target"], alertname): logger.warning( "rule_kubectl_command_discarded_bad_target", @@ -382,6 +383,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None: original_command=kubectl_command[:120], ) kubectl_command = "" + _invalid_target = True # 還有 {var} 殘留 → 模板變數未被 _fill 填滿(可能 vars 缺少對應 key) if kubectl_command and ("{" in kubectl_command or "}" in kubectl_command): @@ -391,6 +393,15 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None: command=kubectl_command[:120], ) kubectl_command = "" + _invalid_target = True + + # 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 讓 decision_manager 感知 target 失敗 + # 根因:rule engine 清空 kubectl_command 但不告訴下游原因 → decision_manager 繼續嘗試執行 + # 修復:設定 blocked_reason="INVALID_TARGET - ..." 讓 auto_execute 路徑提早返回 + TYPE-4 + _blocked_reason_out = ( + f"INVALID_TARGET - target='{vars['target']}' 未解析為有效 Deployment,需 SRE 人工確認" + if _invalid_target else "" + ) return { "rule_id": matched_rule["id"], @@ -416,6 +427,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None: "confidence": 0.0, # 🔴 規則匹配固定 0.0,禁止偽造 "affected_services": [vars["target"]], "signoz_correlation": "", + "blocked_reason": _blocked_reason_out, # "" 正常 | "INVALID_TARGET - ..." 目標解析失敗 } diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py index 0dd00144..3d103098 100644 --- a/apps/api/src/services/auto_approve.py +++ b/apps/api/src/services/auto_approve.py @@ -277,6 +277,19 @@ class AutoApprovePolicy: confidence=confidence, ) + # 條件 1c: 無可執行指令 → 拒絕自動執行(2026-04-16 ogt + Claude Sonnet 4.6) + # 根因:INVALID_TARGET 導致 rule engine 清空 kubectl_command,action 為空 + # 原本繼續走 auto_approve 流程,系統誤報「即將執行」但實際無指令 + # 修復:action 為空字串時直接拒絕,強制 SRE 人工確認 + if not action.strip(): + return self._reject( + reason=AutoApproveReason.NO_PLAYBOOK, + detail="No executable action/kubectl_command — INVALID_TARGET or NO_ACTION, requires human review", + risk_level=risk_level, + trust_score=trust_score, + confidence=confidence, + ) + # 條件 2: 風險等級必須在允許列表中 if risk_level not in self.config.allowed_risk_levels: return self._reject( diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 5917de29..52b61a59 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -263,14 +263,19 @@ async def _push_decision_to_telegram( _notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "") _alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert" - # 2026-04-16 ogt + Claude Sonnet 4.6: YAML NO_ACTION → TYPE-1(資訊通知,無按鈕) - # 根因:YAML 規則標記 NO_ACTION(如 host_resource/postgresql_disk_monitoring) - # 但 classify_notification() 不知道 NO_ACTION → 判為 TYPE-4(confidence 不足) - # → SRE 看到審核按鈕卻沒有任何可執行動作,完全無意義 - # 修復:偵測 blocked_reason 含 "NO_ACTION" → 強制 TYPE-1(純資訊,告知 SRE 即可) + # 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層 + # ① INVALID_TARGET → TYPE-4:系統無法識別告警目標,SRE 需人工調查 + # ② NO_ACTION + critical → TYPE-4:critical 事件不能靜默,即便 AI 無法自動修復 + # ③ NO_ACTION + non-critical → TYPE-1:一般資訊告警,純資訊卡即可 _blocked_reason = proposal_data.get("blocked_reason", "") - if "NO_ACTION" in _blocked_reason: - _notif_type = NotificationType.TYPE_1 + if "INVALID_TARGET" in _blocked_reason: + _notif_type = NotificationType.TYPE_4 + elif "NO_ACTION" in _blocked_reason: + if risk_level == "critical": + # critical + NO_ACTION: SRE 必須知道(TYPE-4 AI 無法判斷),不可靜默為 TYPE-1 + _notif_type = NotificationType.TYPE_4 + else: + _notif_type = NotificationType.TYPE_1 # 2026-04-12 ogt: classify_alert_early() 設的 notification_type 優先於 classify_notification() # 場景:backup/info 告警被 classify_notification() 誤判為 TYPE-3(confidence=0, 無 auto_executed) @@ -1437,6 +1442,25 @@ class DecisionManager: await self._save_token(token) _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) return + + # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4 + # 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason + # 系統不應繼續嘗試執行,提早返回讓 SRE 介入 + _yaml_blocked = _yaml_r.get("blocked_reason", "") + if "INVALID_TARGET" in _yaml_blocked: + logger.warning( + "auto_execute_yaml_invalid_target", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + blocked_reason=_yaml_blocked, + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = _yaml_blocked + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action