fix(happy-path): Happy Path 全境加固 — INVALID_TARGET + critical NO_ACTION + 空指令攔截
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

問題 1 (P0) — deployment/unknown 無效重啟:
- alert_rule_engine: 追蹤 _invalid_target flag,回傳 blocked_reason="INVALID_TARGET-..."
- decision_manager: auto_execute 路徑偵測 INVALID_TARGET → 提早返回 + TYPE-4 人工確認
- auto_approve: 新增條件 1c — action 為空字串直接拒絕,防止誤報「即將執行」

問題 2 (P1) — critical+NO_ACTION 靜默:
- decision_manager: blocked_reason 感知層重構
  ① INVALID_TARGET → TYPE-4
  ② NO_ACTION + critical → TYPE-4(升級,SRE 不可錯過)
  ③ NO_ACTION + 非 critical → TYPE-1(維持純資訊卡)

問題 3 (P1) — 規則匹配信心黑洞:
- auto_approve 條件 1c 確保空 action 不通過 auto-approve
  即便 is_rule_based=True 也無法在無指令時自動執行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-16 22:57:50 +08:00
parent 0077ff9758
commit 83ab5e32d7
3 changed files with 56 additions and 7 deletions

View File

@@ -372,6 +372,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
# GAP-A4 (2026-04-14 Claude Sonnet 4.6): 後置驗證 — 垃圾 target 丟棄 command
# 避免 `kubectl rollout restart deployment unknown/HostHighCpuLoad/...` 這類無效指令
# 清空 kubectl_command 讓 decision_manager 降級給 LLM 處理
_invalid_target = False
if kubectl_command and _is_bad_target(vars["target"], alertname):
logger.warning(
"rule_kubectl_command_discarded_bad_target",
@@ -382,6 +383,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
original_command=kubectl_command[:120],
)
kubectl_command = ""
_invalid_target = True
# 還有 {var} 殘留 → 模板變數未被 _fill 填滿(可能 vars 缺少對應 key)
if kubectl_command and ("{" in kubectl_command or "}" in kubectl_command):
@@ -391,6 +393,15 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
command=kubectl_command[:120],
)
kubectl_command = ""
_invalid_target = True
# 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 讓 decision_manager 感知 target 失敗
# 根因:rule engine 清空 kubectl_command 但不告訴下游原因 → decision_manager 繼續嘗試執行
# 修復:設定 blocked_reason="INVALID_TARGET - ..." 讓 auto_execute 路徑提早返回 + TYPE-4
_blocked_reason_out = (
f"INVALID_TARGET - target='{vars['target']}' 未解析為有效 Deployment需 SRE 人工確認"
if _invalid_target else ""
)
return {
"rule_id": matched_rule["id"],
@@ -416,6 +427,7 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
"confidence": 0.0, # 🔴 規則匹配固定 0.0,禁止偽造
"affected_services": [vars["target"]],
"signoz_correlation": "",
"blocked_reason": _blocked_reason_out, # "" 正常 | "INVALID_TARGET - ..." 目標解析失敗
}

View File

@@ -277,6 +277,19 @@ class AutoApprovePolicy:
confidence=confidence,
)
# 條件 1c: 無可執行指令 → 拒絕自動執行(2026-04-16 ogt + Claude Sonnet 4.6)
# 根因:INVALID_TARGET 導致 rule engine 清空 kubectl_command,action 為空
# 原本繼續走 auto_approve 流程,系統誤報「即將執行」但實際無指令
# 修復:action 為空字串時直接拒絕,強制 SRE 人工確認
if not action.strip():
return self._reject(
reason=AutoApproveReason.NO_PLAYBOOK,
detail="No executable action/kubectl_command — INVALID_TARGET or NO_ACTION, requires human review",
risk_level=risk_level,
trust_score=trust_score,
confidence=confidence,
)
# 條件 2: 風險等級必須在允許列表中
if risk_level not in self.config.allowed_risk_levels:
return self._reject(

View File

@@ -263,14 +263,19 @@ async def _push_decision_to_telegram(
_notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
_alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert"
# 2026-04-16 ogt + Claude Sonnet 4.6: YAML NO_ACTION → TYPE-1(資訊通知,無按鈕)
# 根因:YAML 規則標記 NO_ACTION(如 host_resource/postgresql_disk_monitoring)
# 但 classify_notification() 不知道 NO_ACTION → 判為 TYPE-4(confidence 不足)
# → SRE 看到審核按鈕卻沒有任何可執行動作,完全無意義
# 修復:偵測 blocked_reason 含 "NO_ACTION" → 強制 TYPE-1(純資訊告知 SRE 即可)
# 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層
# ① INVALID_TARGET → TYPE-4(系統無法識別告警目標,SRE 需人工調查)
# ② NO_ACTION + critical → TYPE-4(critical 事件不能靜默,即便 AI 無法自動修復)
# ③ NO_ACTION + non-critical → TYPE-1(一般資訊告警,純資訊卡即可)
_blocked_reason = proposal_data.get("blocked_reason", "")
if "NO_ACTION" in _blocked_reason:
_notif_type = NotificationType.TYPE_1
if "INVALID_TARGET" in _blocked_reason:
_notif_type = NotificationType.TYPE_4
elif "NO_ACTION" in _blocked_reason:
if risk_level == "critical":
# critical + NO_ACTION: SRE 必須知道(TYPE-4 AI 無法判斷),不可靜默為 TYPE-1
_notif_type = NotificationType.TYPE_4
else:
_notif_type = NotificationType.TYPE_1
# 2026-04-12 ogt: classify_alert_early() 設的 notification_type 優先於 classify_notification()
# 場景backup/info 告警被 classify_notification() 誤判為 TYPE-3confidence=0, 無 auto_executed
@@ -1437,6 +1442,25 @@ class DecisionManager:
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
# 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
# 系統不應繼續嘗試執行,提早返回讓 SRE 介入
_yaml_blocked = _yaml_r.get("blocked_reason", "")
if "INVALID_TARGET" in _yaml_blocked:
logger.warning(
"auto_execute_yaml_invalid_target",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
blocked_reason=_yaml_blocked,
)
token.state = DecisionState.READY
token.proposal_data["auto_executed"] = False
token.proposal_data["blocked_reason"] = _yaml_blocked
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action