fix(aiops): escalate failed host verification
This commit is contained in:
@@ -319,6 +319,7 @@ async def _try_auto_repair_background(
|
||||
# auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線
|
||||
# 人工審核路徑已在 approval_execution._run_post_execution_verify 接線,
|
||||
# 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3)
|
||||
_post_verify_result: str | None = None
|
||||
if result:
|
||||
from src.core.feature_flags import aiops_flags
|
||||
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
|
||||
@@ -345,6 +346,7 @@ async def _try_auto_repair_background(
|
||||
),
|
||||
timeout=60.0,
|
||||
)
|
||||
_post_verify_result = _verify_result
|
||||
logger.info(
|
||||
"auto_repair_post_verify_complete",
|
||||
incident_id=incident_id,
|
||||
@@ -376,6 +378,30 @@ async def _try_auto_repair_background(
|
||||
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
|
||||
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
|
||||
if result and result.success:
|
||||
if _post_verify_result in ("failed", "degraded", "timeout"):
|
||||
logger.warning(
|
||||
"incident_resolve_after_auto_repair_skipped_verification",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
verification_result=_post_verify_result,
|
||||
playbook_id=result.playbook_id,
|
||||
)
|
||||
await _escalate_auto_repair_unavailable(
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
failure_reason=(
|
||||
f"auto repair execution succeeded but post verification "
|
||||
f"returned {_post_verify_result}; incident remains open"
|
||||
),
|
||||
attempted_actions=(
|
||||
f"execute_auto_repair playbook:{result.playbook_id} "
|
||||
f"-> verifier:{_post_verify_result} -> emergency_intervention"
|
||||
),
|
||||
)
|
||||
return
|
||||
try:
|
||||
_inc_svc_resolve = get_incident_service()
|
||||
await _inc_svc_resolve.resolve_incident(incident_id)
|
||||
|
||||
@@ -43,6 +43,7 @@ ALERT_EVENT_TYPES = {
|
||||
"BACKUP_COMPLETED",
|
||||
"BACKUP_FAILED",
|
||||
"APPROVAL_ESCALATED",
|
||||
"EMERGENCY_ESCALATED",
|
||||
"CHANGE_APPLIED",
|
||||
# ADR-071 通知生命週期 (2026-04-11 Claude Sonnet 4.6 Asia/Taipei)
|
||||
"NOTIFICATION_CLASSIFIED",
|
||||
|
||||
@@ -521,6 +521,13 @@ class AutoRepairService:
|
||||
# 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合
|
||||
# PostExecutionVerifier 判斷失敗/降級 → 觸發自動 Rollback
|
||||
if verification_result in ("failed", "degraded"):
|
||||
if self._should_escalate_failed_verification(incident, playbook):
|
||||
await self._escalate_failed_verification(
|
||||
incident=incident,
|
||||
playbook=playbook,
|
||||
verification_result=verification_result,
|
||||
)
|
||||
return
|
||||
try:
|
||||
from src.services.rollback_manager import get_rollback_manager
|
||||
from src.services.declarative_remediation import DeclarativeRemediation
|
||||
@@ -716,6 +723,69 @@ class AutoRepairService:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
    """Decide whether a failed verification must escalate instead of rolling back.

    Synthesizing a K8s rollback is forbidden when the incident is a
    host/backup incident, or when the playbook has no K8s steps.

    Args:
        incident: Incident whose post-execution verification failed.
        playbook: Playbook that was executed for the incident.

    Returns:
        A truthy value when the failure should be escalated; a falsy value
        when the K8s rollback path may still be used.
    """
    # Host/backup incidents escalate no matter what the playbook contains;
    # the playbook is not inspected in that case.
    host_or_backup = self._is_host_or_backup_incident(incident)
    if host_or_backup:
        return host_or_backup

    # Otherwise escalate only when the playbook has no K8s steps.
    return not self._playbook_has_k8s_steps(playbook)
|
||||
|
||||
async def _escalate_failed_verification(
    self,
    *,
    incident: Incident,
    playbook: Playbook,
    verification_result: str,
) -> None:
    """Notify the emergency channel when rollback after a failed verification is unsafe.

    Emits a warning log, then calls ``escalate_auto_repair_unavailable`` using
    the first affected service of the incident as the target resource
    (``"unknown"`` when the incident lists none). Any exception raised while
    importing or calling the escalation function is caught and logged as a
    warning; it is not re-raised.

    Args:
        incident: Incident whose post-execution verification failed.
        playbook: Playbook that was executed for the incident.
        verification_result: Verifier outcome, included in the log and in
            the escalation message.
    """
    affected = incident.affected_services or ["unknown"]
    target_resource = affected[0]
    alert_name = self._incident_alert_type(incident)

    failure_reason = (
        f"auto repair playbook {playbook.playbook_id} verification={verification_result}; "
        "rollback is unsafe for host/backup or non-K8s remediation"
    )

    logger.warning(
        "auto_repair_verification_failed_emergency",
        incident_id=incident.incident_id,
        playbook_id=playbook.playbook_id,
        verification_result=verification_result,
        target=target_resource,
    )

    try:
        # Imported lazily inside the try so an import failure is handled
        # the same way as a failed escalation call.
        from src.services.emergency_escalation_service import (
            escalate_auto_repair_unavailable,
        )

        escalation_kwargs = {
            "incident_id": incident.incident_id,
            # No approval record is passed on this path.
            "approval_id": None,
            "alert_type": alert_name,
            "target_resource": target_resource,
            "namespace": "awoooi-prod",
            "failure_reason": failure_reason,
            "attempted_actions": (
                f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} "
                "-> emergency_intervention"
            ),
        }
        await escalate_auto_repair_unavailable(**escalation_kwargs)
    except Exception as exc:
        # Best effort: a failed escalation is logged, not raised.
        logger.warning(
            "auto_repair_verification_emergency_failed",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            error=str(exc),
        )
|
||||
|
||||
def _incident_alert_type(self, incident: Incident) -> str:
    """Return a best-effort alert name for emergency cards.

    Signals are scanned in order. For each signal the ``alertname`` label is
    preferred, falling back to the signal's ``alert_name`` attribute. The
    first non-empty value found is returned as a string.

    This helper is used while building an emergency escalation, so it must
    not raise on partially populated signals: a signal that lacks a
    ``labels`` or ``alert_name`` attribute is treated as having none.

    Args:
        incident: Incident whose ``signals`` are inspected; ``None`` or an
            empty list is accepted.

    Returns:
        The first alert name found, or ``"AutoRepairVerificationFailed"``
        when no signal carries one.
    """
    for signal in incident.signals or []:
        # Tolerate signals that have no labels attribute or labels=None.
        labels = getattr(signal, "labels", None) or {}
        alertname = labels.get("alertname") or getattr(signal, "alert_name", None)
        if alertname:
            return str(alertname)
    return "AutoRepairVerificationFailed"
|
||||
|
||||
def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
|
||||
"""檢查風險是否超過自動修復門檻"""
|
||||
high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}
|
||||
|
||||
@@ -19,7 +19,7 @@ logger = structlog.get_logger(__name__)
|
||||
async def escalate_auto_repair_unavailable(
|
||||
*,
|
||||
incident_id: str,
|
||||
approval_id: str,
|
||||
approval_id: str | None,
|
||||
alert_type: str,
|
||||
target_resource: str,
|
||||
namespace: str,
|
||||
|
||||
@@ -335,6 +335,43 @@ class TestAutoRepairService:
|
||||
assert decision.can_auto_repair is True
|
||||
assert decision.blocked_by is None
|
||||
|
||||
def test_failed_verification_escalates_for_host_backup_ssh_playbook(self, service):
    """A failed backup SSH diagnostic must escalate, not synthesize a K8s rollback."""
    # Playbook whose only step is an SSH command (no K8s action).
    backup_pattern = SymptomPattern(
        alert_names=["HostBackupFailed"],
        affected_services=["test-service"],
    )
    ssh_tail_step = RepairStep(
        step_number=1,
        action_type=ActionType.SSH_COMMAND,
        command="ssh {host} 'tail -80 /var/log/backup.log'",
        risk_level=RiskLevel.LOW,
    )
    ssh_playbook = Playbook(
        playbook_id="PB-BACKUP-SSH",
        name="Backup SSH diagnostics",
        description="Read-only backup diagnosis",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=backup_pattern,
        repair_steps=[ssh_tail_step],
        success_count=20,
        failure_count=1,
    )

    # Incident categorised as a backup failure raised by HostBackupFailed.
    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    must_escalate = service._should_escalate_failed_verification(backup_incident, ssh_playbook)
    assert must_escalate is True
|
||||
|
||||
def test_failed_verification_allows_k8s_rollback_for_k8s_playbook(self, service):
    """The default test playbook and incident must not trigger escalation.

    With the standard fixtures the existing K8s rollback path stays
    available, so the escalation decision is expected to be False.
    """
    k8s_playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
    default_incident = create_test_incident(severity=Severity.P2)

    must_escalate = service._should_escalate_failed_verification(default_incident, k8s_playbook)
    assert must_escalate is False
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
|
||||
"""Test that LOW risk actions are allowed"""
|
||||
|
||||
@@ -13,6 +13,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
|
||||
### 完成
|
||||
- `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。
|
||||
- `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。
|
||||
- `AutoRepairService` post-verification rollback guard:host/backup 或非 K8s Playbook 驗證失敗時,不再合成 `kubectl rollout restart deployment/{target}`,改走 emergency escalation,且不自動 resolve incident。
|
||||
- `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。
|
||||
- 補 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。
|
||||
|
||||
@@ -20,6 +21,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
|
||||
- `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。
|
||||
- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。
|
||||
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。
|
||||
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 29 passed。
|
||||
- YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。
|
||||
|
||||
## 2026-04-30 | ADR-104 Playbook 版本化 lineage
|
||||
|
||||
Reference in New Issue
Block a user