diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 322e9242..3deb040f 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -263,41 +263,16 @@ async def _push_decision_to_telegram( _notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "") _alertname = incident.signals[0].labels.get("alertname", "MetaSystemAlert") if incident.signals else "MetaSystemAlert" - # 2026-04-16 ogt + Claude Sonnet 4.6: blocked_reason 感知層 - # ① INVALID_TARGET → TYPE-4:系統無法識別告警目標,SRE 需人工調查 - # ② NO_ACTION + critical → TYPE-4:critical 事件不能靜默,即便 AI 無法自動修復 - # ③ NO_ACTION + non-critical → TYPE-1:一般資訊告警,純資訊卡即可 + # blocked_reason 由上游「決策路由閘門」統一設定(decide() 第 4c 步) + # 此處只做狀態轉譯,通知層禁止查詢業務邏輯(架構師鐵律 2026-04-17) + # ① INVALID_TARGET → TYPE-4:目標無法解析,SRE 需人工調查 + # ② NO_ACTION + critical → TYPE-4:critical 事件不可靜默 + # ③ NO_ACTION + non-critical → TYPE-1:純資訊卡 _blocked_reason = proposal_data.get("blocked_reason", "") - - # 2026-04-17 ogt + Claude Sonnet 4.6: Inline YAML NO_ACTION 補查 - # 根因:Phase 2 路徑(agent debate → auto_approve 拒絕 → 直接推 TG)不經過 - # auto_execute() 的 YAML check(lines 1427-1439),Coordinator 不設 blocked_reason。 - # 修復:在 _push_decision_to_telegram 內做一次 alertname 查詢,任何路徑都能偵測 NO_ACTION。 - if not _blocked_reason and incident.signals: - try: - from src.services.alert_rule_engine import match_rule as _notif_match - _notif_alertname = incident.signals[0].labels.get("alertname", "") - if _notif_alertname: - _notif_rule = _notif_match({ - "labels": incident.signals[0].labels, - "alert_type": _notif_alertname, - "message": "", - "target_resource": target, - "namespace": "awoooi-prod", - }) - if _notif_rule: - if _notif_rule.get("blocked_reason", "").startswith("INVALID_TARGET"): - _blocked_reason = _notif_rule["blocked_reason"] - elif _notif_rule.get("suggested_action") == "NO_ACTION": - _blocked_reason = f"YAML: NO_ACTION for {_notif_alertname}" - except Exception: - pass # 查詢失敗不影響主流程 - if "INVALID_TARGET" in _blocked_reason: _notif_type = NotificationType.TYPE_4 elif "NO_ACTION" in _blocked_reason: if risk_level == "critical": - # critical + NO_ACTION: SRE 必須知道(TYPE-4 AI 無法判斷),不可靜默為 TYPE-1 _notif_type = NotificationType.TYPE_4 else: _notif_type = NotificationType.TYPE_1 @@ -1389,6 +1364,57 @@ class DecisionManager: self._persist_decision_to_db(incident.incident_id, token.proposal_data) ) + # 4c. YAML NO_ACTION 閘門 — 決策路由中樞(2026-04-17 ogt + Claude Sonnet 4.6) + # 架構原則:通知層(_push_decision_to_telegram)只做狀態轉譯,不查業務邏輯。 + # 因此 NO_ACTION / INVALID_TARGET 判斷必須在此閘門統一完成,任何路徑都必須通過。 + # + # 根因(先前盲點):Phase 2 路徑拒絕後直接推 TG,不經 auto_execute() 的 YAML check, + # Coordinator 不設 blocked_reason → TG 收到空 blocked_reason → 無法正確分類通知類型。 + # 修復:在 auto_approve 之前,統一查詢 YAML 規則: + # NO_ACTION → 標記 blocked_reason + is_informational_only → 短路跳過 auto_approve + # INVALID_TARGET → 標記 blocked_reason → 短路,TYPE-4 由 TG 層轉譯 + if token.state == DecisionState.READY and token.proposal_data and incident.signals: + _gate_alertname = incident.signals[0].labels.get("alertname", "") + if _gate_alertname and not token.proposal_data.get("blocked_reason", ""): + try: + from src.services.alert_rule_engine import match_rule as _gate_match + _gate_r = _gate_match({ + "labels": incident.signals[0].labels, + "alert_type": _gate_alertname, + "message": "", + "target_resource": ( + incident.affected_services[0] + if incident.affected_services else "unknown" + ), + "namespace": "awoooi-prod", + }) + if _gate_r: + _gate_blocked = _gate_r.get("blocked_reason", "") + _gate_action = _gate_r.get("suggested_action", "") + if "INVALID_TARGET" in _gate_blocked or _gate_action == "NO_ACTION": + # 標記 blocked_reason,讓 TG 層正確轉譯通知類型 + if "INVALID_TARGET" in _gate_blocked: + token.proposal_data["blocked_reason"] = _gate_blocked + else: + token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_gate_alertname}" + token.proposal_data["is_informational_only"] = True + token.proposal_data["auto_executed"] = False + token.proposal_data["decision_state"] = token.state.value + await self._save_token(token) + _fire_and_forget( + _push_decision_to_telegram(incident, token.proposal_data) + ) + logger.info( + "yaml_gate_short_circuit", + incident_id=incident.incident_id, + alertname=_gate_alertname, + blocked_reason=token.proposal_data["blocked_reason"], + ) + return token # 短路 — 跳過 auto_approve + Blast Radius 評估 + except Exception as _gate_err: + logger.debug("yaml_gate_error", error=str(_gate_err)) + # 閘門查詢失敗 → 降級繼續正常流程(不阻塞) + # 5. ADR-030 Phase 4: 自動執行判斷 if token.state == DecisionState.READY and token.proposal_data: # 評估是否可以自動執行