fix(aiops): escalate failed host verification
Some checks failed
CD Pipeline / tests (push) Successful in 1m27s
Code Review / ai-code-review (push) Successful in 29s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-01 10:47:29 +08:00
parent 046d598e88
commit 97be5dedd7
6 changed files with 137 additions and 1 deletions

View File

@@ -319,6 +319,7 @@ async def _try_auto_repair_background(
# auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線
# 人工審核路徑已在 approval_execution._run_post_execution_verify 接線,
# 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3)
_post_verify_result: str | None = None
if result:
from src.core.feature_flags import aiops_flags
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
@@ -345,6 +346,7 @@ async def _try_auto_repair_background(
),
timeout=60.0,
)
_post_verify_result = _verify_result
logger.info(
"auto_repair_post_verify_complete",
incident_id=incident_id,
@@ -376,6 +378,30 @@ async def _try_auto_repair_background(
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
if result and result.success:
if _post_verify_result in ("failed", "degraded", "timeout"):
logger.warning(
"incident_resolve_after_auto_repair_skipped_verification",
incident_id=incident_id,
approval_id=approval_id,
verification_result=_post_verify_result,
playbook_id=result.playbook_id,
)
await _escalate_auto_repair_unavailable(
incident_id=incident_id,
approval_id=approval_id,
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason=(
f"auto repair execution succeeded but post verification "
f"returned {_post_verify_result}; incident remains open"
),
attempted_actions=(
f"execute_auto_repair playbook:{result.playbook_id} "
f"-> verifier:{_post_verify_result} -> emergency_intervention"
),
)
return
try:
_inc_svc_resolve = get_incident_service()
await _inc_svc_resolve.resolve_incident(incident_id)