refactor(decision): 狀態機重構 — YAML NO_ACTION 閘門上移至決策路由中樞
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
架構師指令(2026-04-17):通知層禁止查詢業務邏輯。
撤銷 c05bcdb 的 inline YAML 查詢(義大利麵補丁),
將 NO_ACTION / INVALID_TARGET 判斷移至正確位置。
重構方向:
① 移除 _push_decision_to_telegram() 的 inline YAML 查詢
→ 通知層只做 blocked_reason → NotificationType 轉譯(Single Responsibility)
② 新增 decide() 第 4c 步:YAML NO_ACTION 路由閘門
位置:_dual_engine_analyze() 返回後、auto_approve.evaluate() 之前
邏輯:
- NO_ACTION → blocked_reason="YAML: NO_ACTION for <alertname>" + is_informational_only=True
→ 短路跳過 auto_approve + Blast Radius → TYPE-1(或 critical → TYPE-4)
- INVALID_TARGET → blocked_reason="INVALID_TARGET-..." → 短路 → TYPE-4
- 閘門查詢失敗 → 靜默降級,繼續正常流程
Checkpoint 覆蓋:
CP1 上移 YAML 評估層 ✅
CP2 短路跳過 auto_approve ✅
CP3 通知層純粹轉譯 ✅
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -263,41 +263,16 @@ async def _push_decision_to_telegram(
|
||||
_notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
|
||||
_alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert"
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層
|
||||
# ① INVALID_TARGET → TYPE-4:系統無法識別告警目標,SRE 需人工調查
|
||||
# ② NO_ACTION + critical → TYPE-4:critical 事件不能靜默,即便 AI 無法自動修復
|
||||
# ③ NO_ACTION + non-critical → TYPE-1:一般資訊告警,純資訊卡即可
|
||||
# blocked_reason 由上游「決策路由閘門」統一設定(decide() 第 4c 步)
|
||||
# 此處只做狀態轉譯,通知層禁止查詢業務邏輯(架構師鐵律 2026-04-17)
|
||||
# ① INVALID_TARGET → TYPE-4:目標無法解析,SRE 需人工調查
|
||||
# ② NO_ACTION + critical → TYPE-4:critical 事件不可靜默
|
||||
# ③ NO_ACTION + non-critical → TYPE-1:純資訊卡
|
||||
_blocked_reason = proposal_data.get("blocked_reason", "")
|
||||
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6: Inline YAML NO_ACTION 補查
|
||||
# 根因:Phase 2 路徑(agent debate → auto_approve 拒絕 → 直接推 TG)不經過
|
||||
# auto_execute() 的 YAML check(lines 1427-1439),Coordinator 不設 blocked_reason。
|
||||
# 修復:在 _push_decision_to_telegram 內做一次 alertname 查詢,任何路徑都能偵測 NO_ACTION。
|
||||
if not _blocked_reason and incident.signals:
|
||||
try:
|
||||
from src.services.alert_rule_engine import match_rule as _notif_match
|
||||
_notif_alertname = incident.signals[0].labels.get("alertname", "")
|
||||
if _notif_alertname:
|
||||
_notif_rule = _notif_match({
|
||||
"labels": incident.signals[0].labels,
|
||||
"alert_type": _notif_alertname,
|
||||
"message": "",
|
||||
"target_resource": target,
|
||||
"namespace": "awoooi-prod",
|
||||
})
|
||||
if _notif_rule:
|
||||
if _notif_rule.get("blocked_reason", "").startswith("INVALID_TARGET"):
|
||||
_blocked_reason = _notif_rule["blocked_reason"]
|
||||
elif _notif_rule.get("suggested_action") == "NO_ACTION":
|
||||
_blocked_reason = f"YAML: NO_ACTION for {_notif_alertname}"
|
||||
except Exception:
|
||||
pass # 查詢失敗不影響主流程
|
||||
|
||||
if "INVALID_TARGET" in _blocked_reason:
|
||||
_notif_type = NotificationType.TYPE_4
|
||||
elif "NO_ACTION" in _blocked_reason:
|
||||
if risk_level == "critical":
|
||||
# critical + NO_ACTION: SRE 必須知道(TYPE-4 AI 無法判斷),不可靜默為 TYPE-1
|
||||
_notif_type = NotificationType.TYPE_4
|
||||
else:
|
||||
_notif_type = NotificationType.TYPE_1
|
||||
@@ -1389,6 +1364,57 @@ class DecisionManager:
|
||||
self._persist_decision_to_db(incident.incident_id, token.proposal_data)
|
||||
)
|
||||
|
||||
# 4c. YAML NO_ACTION 閘門 — 決策路由中樞(2026-04-17 ogt + Claude Sonnet 4.6)
|
||||
# 架構原則:通知層(_push_decision_to_telegram)只做狀態轉譯,不查業務邏輯。
|
||||
# 因此 NO_ACTION / INVALID_TARGET 判斷必須在此閘門統一完成,任何路徑都必須通過。
|
||||
#
|
||||
# 根因(先前盲點):Phase 2 路徑拒絕後直接推 TG,不經 auto_execute() 的 YAML check,
|
||||
# Coordinator 不設 blocked_reason → TG 收到空 blocked_reason → 無法正確分類通知類型。
|
||||
# 修復:在 auto_approve 之前,統一查詢 YAML 規則:
|
||||
# NO_ACTION → 標記 blocked_reason + is_informational_only → 短路跳過 auto_approve
|
||||
# INVALID_TARGET → 標記 blocked_reason → 短路,TYPE-4 由 TG 層轉譯
|
||||
if token.state == DecisionState.READY and token.proposal_data and incident.signals:
|
||||
_gate_alertname = incident.signals[0].labels.get("alertname", "")
|
||||
if _gate_alertname and not token.proposal_data.get("blocked_reason", ""):
|
||||
try:
|
||||
from src.services.alert_rule_engine import match_rule as _gate_match
|
||||
_gate_r = _gate_match({
|
||||
"labels": incident.signals[0].labels,
|
||||
"alert_type": _gate_alertname,
|
||||
"message": "",
|
||||
"target_resource": (
|
||||
incident.affected_services[0]
|
||||
if incident.affected_services else "unknown"
|
||||
),
|
||||
"namespace": "awoooi-prod",
|
||||
})
|
||||
if _gate_r:
|
||||
_gate_blocked = _gate_r.get("blocked_reason", "")
|
||||
_gate_action = _gate_r.get("suggested_action", "")
|
||||
if "INVALID_TARGET" in _gate_blocked or _gate_action == "NO_ACTION":
|
||||
# 標記 blocked_reason,讓 TG 層正確轉譯通知類型
|
||||
if "INVALID_TARGET" in _gate_blocked:
|
||||
token.proposal_data["blocked_reason"] = _gate_blocked
|
||||
else:
|
||||
token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_gate_alertname}"
|
||||
token.proposal_data["is_informational_only"] = True
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["decision_state"] = token.state.value
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(
|
||||
_push_decision_to_telegram(incident, token.proposal_data)
|
||||
)
|
||||
logger.info(
|
||||
"yaml_gate_short_circuit",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_gate_alertname,
|
||||
blocked_reason=token.proposal_data["blocked_reason"],
|
||||
)
|
||||
return token # 短路 — 跳過 auto_approve + Blast Radius 評估
|
||||
except Exception as _gate_err:
|
||||
logger.debug("yaml_gate_error", error=str(_gate_err))
|
||||
# 閘門查詢失敗 → 降級繼續正常流程(不阻塞)
|
||||
|
||||
# 5. ADR-030 Phase 4: 自動執行判斷
|
||||
if token.state == DecisionState.READY and token.proposal_data:
|
||||
# 評估是否可以自動執行
|
||||
|
||||
Reference in New Issue
Block a user