fix(aiops): escalate failed host verification
This commit is contained in:
@@ -319,6 +319,7 @@ async def _try_auto_repair_background(
|
||||
# auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線
|
||||
# 人工審核路徑已在 approval_execution._run_post_execution_verify 接線,
|
||||
# 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3)
|
||||
_post_verify_result: str | None = None
|
||||
if result:
|
||||
from src.core.feature_flags import aiops_flags
|
||||
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
|
||||
@@ -345,6 +346,7 @@ async def _try_auto_repair_background(
|
||||
),
|
||||
timeout=60.0,
|
||||
)
|
||||
_post_verify_result = _verify_result
|
||||
logger.info(
|
||||
"auto_repair_post_verify_complete",
|
||||
incident_id=incident_id,
|
||||
@@ -376,6 +378,30 @@ async def _try_auto_repair_background(
|
||||
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
|
||||
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
|
||||
if result and result.success:
|
||||
if _post_verify_result in ("failed", "degraded", "timeout"):
|
||||
logger.warning(
|
||||
"incident_resolve_after_auto_repair_skipped_verification",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
verification_result=_post_verify_result,
|
||||
playbook_id=result.playbook_id,
|
||||
)
|
||||
await _escalate_auto_repair_unavailable(
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
failure_reason=(
|
||||
f"auto repair execution succeeded but post verification "
|
||||
f"returned {_post_verify_result}; incident remains open"
|
||||
),
|
||||
attempted_actions=(
|
||||
f"execute_auto_repair playbook:{result.playbook_id} "
|
||||
f"-> verifier:{_post_verify_result} -> emergency_intervention"
|
||||
),
|
||||
)
|
||||
return
|
||||
try:
|
||||
_inc_svc_resolve = get_incident_service()
|
||||
await _inc_svc_resolve.resolve_incident(incident_id)
|
||||
|
||||
Reference in New Issue
Block a user