fix(aiops): fallback and escalate automation blockers
This commit is contained in:
@@ -180,6 +180,7 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
|
||||
# 2026-04-24: 嘗試低風險自動採納
|
||||
auto_adopted = False
|
||||
auto_block_reason = ""
|
||||
try:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
auto_result = await adopt_svc.auto_adopt_if_safe(report)
|
||||
@@ -197,19 +198,28 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
pr_url=auto_result.get("pr_url"),
|
||||
)
|
||||
else:
|
||||
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
|
||||
_logger.info(
|
||||
"drift_auto_adopt_skipped",
|
||||
report_id=report.report_id,
|
||||
reason=auto_result.get("reason", ""),
|
||||
reason=auto_block_reason,
|
||||
skipped=auto_result.get("skipped", True),
|
||||
)
|
||||
except Exception as e:
|
||||
auto_block_reason = f"auto adopt error: {str(e)[:120]}"
|
||||
_logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e))
|
||||
|
||||
if auto_adopted:
|
||||
# 自動採納成功,Telegram 通知已在 auto_adopt_if_safe 內發出,不再推送按鈕卡片
|
||||
return
|
||||
|
||||
if auto_block_reason:
|
||||
await _escalate_drift_auto_adopt_blocked(
|
||||
report=report,
|
||||
reason=auto_block_reason,
|
||||
interpretation=interpretation,
|
||||
)
|
||||
|
||||
# ADR-075: drift_narrator_service 負責發送 TYPE-4D 卡片(含按鈕)
|
||||
# 舊的 send_text() 已移除,改由 narrate_and_notify() 統一處理
|
||||
try:
|
||||
@@ -224,6 +234,25 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
structlog.get_logger(__name__).error("drift_analyze_notify_failed", error=str(e))
|
||||
|
||||
|
||||
async def _escalate_drift_auto_adopt_blocked(
    *,
    report: DriftReport,
    reason: str,
    interpretation,
) -> None:
    """Hand a blocked drift auto-adopt over to the emergency escalation service.

    This is a thin pass-through: it awaits
    ``escalate_drift_auto_adopt_blocked`` from
    ``src.services.emergency_escalation_service`` with the same keyword
    arguments it received and returns nothing.

    Args:
        report: The drift report whose auto-adopt attempt did not go through.
        reason: Text describing why the auto-adopt was blocked.
        interpretation: Forwarded untouched to the service.
            NOTE(review): its type is not visible here — confirm against
            the service signature.
    """
    # The service is imported at call time rather than at module load.
    # NOTE(review): presumably to avoid an import cycle — confirm.
    from src.services.emergency_escalation_service import (
        escalate_drift_auto_adopt_blocked as _escalate,
    )

    forwarded = {
        "report": report,
        "reason": reason,
        "interpretation": interpretation,
    }
    await _escalate(**forwarded)
|
||||
|
||||
|
||||
async def _run_full_scan(namespaces: list[str]) -> None:
|
||||
"""背景:完整漂移掃描"""
|
||||
detector = get_drift_detector()
|
||||
|
||||
@@ -110,6 +110,34 @@ def _should_bypass_alertmanager_llm(
|
||||
and alert_category == "host_resource"
|
||||
)
|
||||
|
||||
|
||||
async def _escalate_auto_repair_unavailable(
    *,
    incident_id: str,
    approval_id: str,
    alert_type: str,
    target_resource: str,
    namespace: str,
    failure_reason: str,
    attempted_actions: str,
) -> None:
    """Hand an auto-repair blocker over to the emergency escalation service.

    This is a thin pass-through: it awaits
    ``escalate_auto_repair_unavailable`` from
    ``src.services.emergency_escalation_service`` with the same keyword
    arguments it received and returns nothing.

    Args:
        incident_id: Identifier of the incident being handled.
        approval_id: Identifier of the related approval.
        alert_type: Type of the alert that triggered the repair attempt.
        target_resource: Resource the repair was aimed at.
        namespace: Namespace of the target resource.
        failure_reason: Text describing why auto repair was unavailable
            or failed.
        attempted_actions: Text describing what was attempted before
            escalating.
    """
    # The service is imported at call time rather than at module load.
    # NOTE(review): presumably to avoid an import cycle — confirm.
    from src.services.emergency_escalation_service import (
        escalate_auto_repair_unavailable as _escalate,
    )

    forwarded = {
        "incident_id": incident_id,
        "approval_id": approval_id,
        "alert_type": alert_type,
        "target_resource": target_resource,
        "namespace": namespace,
        "failure_reason": failure_reason,
        "attempted_actions": attempted_actions,
    }
    await _escalate(**forwarded)
|
||||
|
||||
|
||||
async def _try_auto_repair_background(
|
||||
incident_id: str,
|
||||
approval_id: str,
|
||||
@@ -165,6 +193,15 @@ async def _try_auto_repair_background(
|
||||
"playbook_id": decision.playbook.playbook_id if decision.playbook else None,
|
||||
},
|
||||
)
|
||||
await _escalate_auto_repair_unavailable(
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
failure_reason=decision.reason,
|
||||
attempted_actions=f"evaluate_auto_repair -> blocked:{decision.blocked_by}",
|
||||
)
|
||||
return
|
||||
|
||||
# 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤)
|
||||
@@ -228,6 +265,16 @@ async def _try_auto_repair_background(
|
||||
"namespace": namespace,
|
||||
},
|
||||
)
|
||||
if not result.success:
|
||||
await _escalate_auto_repair_unavailable(
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
failure_reason=result.error or "auto repair execution failed",
|
||||
attempted_actions=f"execute_auto_repair playbook:{result.playbook_id}",
|
||||
)
|
||||
|
||||
# ADR-073 Phase 2-3: 自動修復結果 → 寫入 Incident outcome (2026-04-12 ogt)
|
||||
# 讓 KMConversionService 可依 outcome 判斷是否為 EXECUTION_SUCCESS
|
||||
|
||||
Reference in New Issue
Block a user