fix(happy-path): Happy Path 全境加固 — INVALID_TARGET + critical NO_ACTION + 空指令攔截
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題 1 (P0) — deployment/unknown 無效重啟:
- alert_rule_engine: 追蹤 _invalid_target flag,回傳 blocked_reason="INVALID_TARGET - ..."
- decision_manager: auto_execute 路徑偵測 INVALID_TARGET → 提早返回 + TYPE-4 人工確認
- auto_approve: 新增條件 1c — action 為空字串直接拒絕,防止誤報「即將執行」

問題 2 (P1) — critical+NO_ACTION 靜默:
- decision_manager: blocked_reason 感知層重構
  ① INVALID_TARGET → TYPE-4
  ② NO_ACTION + critical → TYPE-4(升級,SRE 不可錯過)
  ③ NO_ACTION + 非 critical → TYPE-1(維持純資訊卡)

問題 3 (P1) — 規則匹配信心黑洞:
- auto_approve 條件 1c 確保空 action 不通過 auto-approve,
  即便 is_rule_based=True 也無法在無指令時自動執行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -372,6 +372,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
# GAP-A4 (2026-04-14 Claude Sonnet 4.6): 後置驗證 — 垃圾 target 丟棄 command
|
||||
# 避免 `kubectl rollout restart deployment unknown/HostHighCpuLoad/...` 這類無效指令
|
||||
# 清空 kubectl_command 讓 decision_manager 降級給 LLM 處理
|
||||
_invalid_target = False
|
||||
if kubectl_command and _is_bad_target(vars["target"], alertname):
|
||||
logger.warning(
|
||||
"rule_kubectl_command_discarded_bad_target",
|
||||
@@ -382,6 +383,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
original_command=kubectl_command[:120],
|
||||
)
|
||||
kubectl_command = ""
|
||||
_invalid_target = True
|
||||
|
||||
# 還有 {var} 殘留 → 模板變數未被 _fill 填滿(可能 vars 缺少對應 key)
|
||||
if kubectl_command and ("{" in kubectl_command or "}" in kubectl_command):
|
||||
@@ -391,6 +393,15 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
command=kubectl_command[:120],
|
||||
)
|
||||
kubectl_command = ""
|
||||
_invalid_target = True
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 讓 decision_manager 感知 target 失敗
|
||||
# 根因:rule engine 清空 kubectl_command 但不告訴下游原因 → decision_manager 繼續嘗試執行
|
||||
# 修復:設定 blocked_reason="INVALID_TARGET - ..." 讓 auto_execute 路徑提早返回 + TYPE-4
|
||||
_blocked_reason_out = (
|
||||
f"INVALID_TARGET - target='{vars['target']}' 未解析為有效 Deployment,需 SRE 人工確認"
|
||||
if _invalid_target else ""
|
||||
)
|
||||
|
||||
return {
|
||||
"rule_id": matched_rule["id"],
|
||||
@@ -416,6 +427,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
"confidence": 0.0, # 🔴 規則匹配固定 0.0,禁止偽造
|
||||
"affected_services": [vars["target"]],
|
||||
"signoz_correlation": "",
|
||||
"blocked_reason": _blocked_reason_out, # "" 正常 | "INVALID_TARGET - ..." 目標解析失敗
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -277,6 +277,19 @@ class AutoApprovePolicy:
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
# 條件 1c: 無可執行指令 → 拒絕自動執行(2026-04-16 ogt + Claude Sonnet 4.6)
|
||||
# 根因:INVALID_TARGET 導致 rule engine 清空 kubectl_command,action 為空
|
||||
# 原本繼續走 auto_approve 流程,系統誤報「即將執行」但實際無指令
|
||||
# 修復:action 為空字串時直接拒絕,強制 SRE 人工確認
|
||||
if not action.strip():
|
||||
return self._reject(
|
||||
reason=AutoApproveReason.NO_PLAYBOOK,
|
||||
detail="No executable action/kubectl_command — INVALID_TARGET or NO_ACTION, requires human review",
|
||||
risk_level=risk_level,
|
||||
trust_score=trust_score,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
# 條件 2: 風險等級必須在允許列表中
|
||||
if risk_level not in self.config.allowed_risk_levels:
|
||||
return self._reject(
|
||||
|
||||
@@ -263,14 +263,19 @@ async def _push_decision_to_telegram(
|
||||
_notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
|
||||
_alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert"
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: YAML NO_ACTION → TYPE-1(資訊通知,無按鈕)
|
||||
# 根因:YAML 規則標記 NO_ACTION(如 host_resource/postgresql_disk_monitoring)
|
||||
# 但 classify_notification() 不知道 NO_ACTION → 判為 TYPE-4(confidence 不足)
|
||||
# → SRE 看到審核按鈕卻沒有任何可執行動作,完全無意義
|
||||
# 修復:偵測 blocked_reason 含 "NO_ACTION" → 強制 TYPE-1(純資訊,告知 SRE 即可)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層
|
||||
# ① INVALID_TARGET → TYPE-4:系統無法識別告警目標,SRE 需人工調查
|
||||
# ② NO_ACTION + critical → TYPE-4:critical 事件不能靜默,即便 AI 無法自動修復
|
||||
# ③ NO_ACTION + non-critical → TYPE-1:一般資訊告警,純資訊卡即可
|
||||
_blocked_reason = proposal_data.get("blocked_reason", "")
|
||||
if "NO_ACTION" in _blocked_reason:
|
||||
_notif_type = NotificationType.TYPE_1
|
||||
if "INVALID_TARGET" in _blocked_reason:
|
||||
_notif_type = NotificationType.TYPE_4
|
||||
elif "NO_ACTION" in _blocked_reason:
|
||||
if risk_level == "critical":
|
||||
# critical + NO_ACTION: SRE 必須知道(TYPE-4 AI 無法判斷),不可靜默為 TYPE-1
|
||||
_notif_type = NotificationType.TYPE_4
|
||||
else:
|
||||
_notif_type = NotificationType.TYPE_1
|
||||
|
||||
# 2026-04-12 ogt: classify_alert_early() 設的 notification_type 優先於 classify_notification()
|
||||
# 場景:backup/info 告警被 classify_notification() 誤判為 TYPE-3(confidence=0, 無 auto_executed)
|
||||
@@ -1437,6 +1442,25 @@ class DecisionManager:
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
|
||||
# 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
|
||||
# 系統不應繼續嘗試執行,提早返回讓 SRE 介入
|
||||
_yaml_blocked = _yaml_r.get("blocked_reason", "")
|
||||
if "INVALID_TARGET" in _yaml_blocked:
|
||||
logger.warning(
|
||||
"auto_execute_yaml_invalid_target",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
blocked_reason=_yaml_blocked,
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["blocked_reason"] = _yaml_blocked
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
|
||||
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
|
||||
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action
|
||||
|
||||
Reference in New Issue
Block a user