fix(aiops): fallback and escalate automation blockers
Some checks failed
CD Pipeline / tests (push) Successful in 2m41s
Code Review / ai-code-review (push) Successful in 24s
CD Pipeline / build-and-deploy (push) Successful in 7m51s
CD Pipeline / post-deploy-checks (push) Failing after 2m15s

This commit is contained in:
Your Name
2026-04-30 14:13:57 +08:00
parent 82649c2cbb
commit 80defbed7c
10 changed files with 311 additions and 10 deletions

View File

@@ -180,6 +180,7 @@ async def _analyze_and_notify(report: DriftReport) -> None:
# 2026-04-24: 嘗試低風險自動採納
auto_adopted = False
auto_block_reason = ""
try:
adopt_svc = get_drift_adopt_service()
auto_result = await adopt_svc.auto_adopt_if_safe(report)
@@ -197,19 +198,28 @@ async def _analyze_and_notify(report: DriftReport) -> None:
pr_url=auto_result.get("pr_url"),
)
else:
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
_logger.info(
"drift_auto_adopt_skipped",
report_id=report.report_id,
reason=auto_result.get("reason", ""),
reason=auto_block_reason,
skipped=auto_result.get("skipped", True),
)
except Exception as e:
auto_block_reason = f"auto adopt error: {str(e)[:120]}"
_logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e))
if auto_adopted:
# Auto-adopt succeeded: the Telegram notification was already sent inside auto_adopt_if_safe, so do not push the button card again.
return
if auto_block_reason:
await _escalate_drift_auto_adopt_blocked(
report=report,
reason=auto_block_reason,
interpretation=interpretation,
)
# ADR-075: drift_narrator_service 負責發送 TYPE-4D 卡片(含按鈕)
# 舊的 send_text() 已移除,改由 narrate_and_notify() 統一處理
try:
@@ -224,6 +234,25 @@ async def _analyze_and_notify(report: DriftReport) -> None:
structlog.get_logger(__name__).error("drift_analyze_notify_failed", error=str(e))
async def _escalate_drift_auto_adopt_blocked(
    *,
    report: DriftReport,
    reason: str,
    interpretation,
) -> None:
    """Hand a blocked drift auto-adopt over to the emergency escalation service.

    All arguments are keyword-only and are forwarded unchanged to
    ``escalate_drift_auto_adopt_blocked`` from
    ``src.services.emergency_escalation_service``.

    Args:
        report: The drift report whose automatic adoption did not happen.
        reason: Text describing why the automatic adoption was blocked.
        interpretation: Passed through to the service as-is.
    """
    # The service function is imported when this wrapper runs, not at module load.
    from src.services.emergency_escalation_service import (
        escalate_drift_auto_adopt_blocked as _escalate,
    )

    await _escalate(report=report, reason=reason, interpretation=interpretation)
async def _run_full_scan(namespaces: list[str]) -> None:
"""背景:完整漂移掃描"""
detector = get_drift_detector()

View File

@@ -110,6 +110,34 @@ def _should_bypass_alertmanager_llm(
and alert_category == "host_resource"
)
async def _escalate_auto_repair_unavailable(
    *,
    incident_id: str,
    approval_id: str,
    alert_type: str,
    target_resource: str,
    namespace: str,
    failure_reason: str,
    attempted_actions: str,
) -> None:
    """Hand an auto-repair blocker over to the emergency escalation service.

    All arguments are keyword-only and are forwarded unchanged, under the same
    names, to ``escalate_auto_repair_unavailable`` from
    ``src.services.emergency_escalation_service``.

    Args:
        incident_id: Identifier of the incident being handled.
        approval_id: Identifier of the related approval.
        alert_type: Type of the alert that triggered the repair attempt.
        target_resource: Resource the repair was aimed at.
        namespace: Namespace of the target resource.
        failure_reason: Text describing why auto repair could not proceed.
        attempted_actions: Text describing what was attempted before escalating.
    """
    # The service function is imported when this wrapper runs, not at module load.
    from src.services.emergency_escalation_service import (
        escalate_auto_repair_unavailable as _escalate,
    )

    # Gather the fields once, then pass them to the service as keyword arguments.
    escalation_fields = {
        "incident_id": incident_id,
        "approval_id": approval_id,
        "alert_type": alert_type,
        "target_resource": target_resource,
        "namespace": namespace,
        "failure_reason": failure_reason,
        "attempted_actions": attempted_actions,
    }
    await _escalate(**escalation_fields)
async def _try_auto_repair_background(
incident_id: str,
approval_id: str,
@@ -165,6 +193,15 @@ async def _try_auto_repair_background(
"playbook_id": decision.playbook.playbook_id if decision.playbook else None,
},
)
await _escalate_auto_repair_unavailable(
incident_id=incident_id,
approval_id=approval_id,
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason=decision.reason,
attempted_actions=f"evaluate_auto_repair -> blocked:{decision.blocked_by}",
)
return
# 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤)
@@ -228,6 +265,16 @@ async def _try_auto_repair_background(
"namespace": namespace,
},
)
if not result.success:
await _escalate_auto_repair_unavailable(
incident_id=incident_id,
approval_id=approval_id,
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason=result.error or "auto repair execution failed",
attempted_actions=f"execute_auto_repair playbook:{result.playbook_id}",
)
# ADR-073 Phase 2-3: 自動修復結果 → 寫入 Incident outcome (2026-04-12 ogt)
# 讓 KMConversionService 可依 outcome 判斷是否為 EXECUTION_SUCCESS