fix(aiops): escalate failed host verification
Some checks failed
CD Pipeline / tests (push) Successful in 1m27s
Code Review / ai-code-review (push) Successful in 29s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-01 10:47:29 +08:00
parent 046d598e88
commit 97be5dedd7
6 changed files with 137 additions and 1 deletions

View File

@@ -319,6 +319,7 @@ async def _try_auto_repair_background(
# auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線
# 人工審核路徑已在 approval_execution._run_post_execution_verify 接線,
# 此處補齊 auto_repair 路徑的對稱接線ADR-081 Phase 1 + ADR-083 Phase 3
_post_verify_result: str | None = None
if result:
from src.core.feature_flags import aiops_flags
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
@@ -345,6 +346,7 @@ async def _try_auto_repair_background(
),
timeout=60.0,
)
_post_verify_result = _verify_result
logger.info(
"auto_repair_post_verify_complete",
incident_id=incident_id,
@@ -376,6 +378,30 @@ async def _try_auto_repair_background(
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
# 之前 auto_repair 成功後從未呼叫 resolve_incidentKM 永遠不生成
if result and result.success:
if _post_verify_result in ("failed", "degraded", "timeout"):
logger.warning(
"incident_resolve_after_auto_repair_skipped_verification",
incident_id=incident_id,
approval_id=approval_id,
verification_result=_post_verify_result,
playbook_id=result.playbook_id,
)
await _escalate_auto_repair_unavailable(
incident_id=incident_id,
approval_id=approval_id,
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
failure_reason=(
f"auto repair execution succeeded but post verification "
f"returned {_post_verify_result}; incident remains open"
),
attempted_actions=(
f"execute_auto_repair playbook:{result.playbook_id} "
f"-> verifier:{_post_verify_result} -> emergency_intervention"
),
)
return
try:
_inc_svc_resolve = get_incident_service()
await _inc_svc_resolve.resolve_incident(incident_id)

View File

@@ -43,6 +43,7 @@ ALERT_EVENT_TYPES = {
"BACKUP_COMPLETED",
"BACKUP_FAILED",
"APPROVAL_ESCALATED",
"EMERGENCY_ESCALATED",
"CHANGE_APPLIED",
# ADR-071 通知生命週期 (2026-04-11 Claude Sonnet 4.6 Asia/Taipei)
"NOTIFICATION_CLASSIFIED",

View File

@@ -521,6 +521,13 @@ class AutoRepairService:
# 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合
# PostExecutionVerifier 判斷失敗/降級 → 觸發自動 Rollback
if verification_result in ("failed", "degraded"):
if self._should_escalate_failed_verification(incident, playbook):
await self._escalate_failed_verification(
incident=incident,
playbook=playbook,
verification_result=verification_result,
)
return
try:
from src.services.rollback_manager import get_rollback_manager
from src.services.declarative_remediation import DeclarativeRemediation
@@ -716,6 +723,69 @@ class AutoRepairService:
return True
return False
def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
"""非 K8s 修復或主機/備份事件驗證失敗時,禁止合成 K8s rollback。"""
return self._is_host_or_backup_incident(incident) or not self._playbook_has_k8s_steps(playbook)
async def _escalate_failed_verification(
    self,
    *,
    incident: Incident,
    playbook: Playbook,
    verification_result: str,
) -> None:
    """Send an emergency escalation when verification failed and rollback is unsafe.

    Args:
        incident: Incident whose auto repair failed post-execution verification.
        playbook: Playbook that was executed for the incident.
        verification_result: Verifier outcome, embedded in the log event and
            in the escalation payload.

    Any exception raised while importing or calling the escalation service is
    logged as a warning and swallowed; this method does not re-raise it.
    """
    affected = incident.affected_services or ["unknown"]
    target_resource = affected[0]
    escalation_namespace = "awoooi-prod"
    alert_name = self._incident_alert_type(incident)
    failure_reason = (
        f"auto repair playbook {playbook.playbook_id} verification={verification_result}; "
        "rollback is unsafe for host/backup or non-K8s remediation"
    )

    logger.warning(
        "auto_repair_verification_failed_emergency",
        incident_id=incident.incident_id,
        playbook_id=playbook.playbook_id,
        verification_result=verification_result,
        target=target_resource,
    )

    try:
        # Imported lazily inside the guarded region, so an import failure is
        # handled the same way as a failed escalation call.
        from src.services.emergency_escalation_service import (
            escalate_auto_repair_unavailable,
        )

        action_trail = (
            f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} "
            "-> emergency_intervention"
        )
        await escalate_auto_repair_unavailable(
            incident_id=incident.incident_id,
            approval_id=None,
            alert_type=alert_name,
            target_resource=target_resource,
            namespace=escalation_namespace,
            failure_reason=failure_reason,
            attempted_actions=action_trail,
        )
    except Exception as escalation_error:
        logger.warning(
            "auto_repair_verification_emergency_failed",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            error=str(escalation_error),
        )
def _incident_alert_type(self, incident: Incident) -> str:
"""Best-effort alertname for emergency cards."""
for signal in incident.signals or []:
labels = signal.labels or {}
alertname = labels.get("alertname") or signal.alert_name
if alertname:
return str(alertname)
return "AutoRepairVerificationFailed"
def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
"""檢查風險是否超過自動修復門檻"""
high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}

View File

@@ -19,7 +19,7 @@ logger = structlog.get_logger(__name__)
async def escalate_auto_repair_unavailable(
*,
incident_id: str,
approval_id: str,
approval_id: str | None,
alert_type: str,
target_resource: str,
namespace: str,

View File

@@ -335,6 +335,43 @@ class TestAutoRepairService:
assert decision.can_auto_repair is True
assert decision.blocked_by is None
def test_failed_verification_escalates_for_host_backup_ssh_playbook(self, service):
    """A backup SSH-diagnostics playbook must escalate, never get a K8s rollback."""
    ssh_playbook = Playbook(
        playbook_id="PB-BACKUP-SSH",
        name="Backup SSH diagnostics",
        description="Read-only backup diagnosis",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=SymptomPattern(
            alert_names=["HostBackupFailed"],
            affected_services=["test-service"],
        ),
        # Single SSH step: the playbook contains no K8s action.
        repair_steps=[
            RepairStep(
                step_number=1,
                action_type=ActionType.SSH_COMMAND,
                command="ssh {host} 'tail -80 /var/log/backup.log'",
                risk_level=RiskLevel.LOW,
            ),
        ],
        success_count=20,
        failure_count=1,
    )
    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    must_escalate = service._should_escalate_failed_verification(
        backup_incident, ssh_playbook
    )

    assert must_escalate is True
def test_failed_verification_allows_k8s_rollback_for_k8s_playbook(self, service):
    """A K8s playbook on a default test incident keeps the existing K8s rollback path."""
    k8s_playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
    default_incident = create_test_incident(severity=Severity.P2)

    must_escalate = service._should_escalate_failed_verification(
        default_incident, k8s_playbook
    )

    assert must_escalate is False
@pytest.mark.asyncio
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
"""Test that LOW risk actions are allowed"""

View File

@@ -13,6 +13,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
### 完成
- `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。
- `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。
- `AutoRepairService` post-verification rollback guard:host/backup 或非 K8s Playbook 驗證失敗時,不再合成 `kubectl rollout restart deployment/{target}`,改走 emergency escalation,且不自動 resolve incident。
- `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。
- `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。
@@ -20,6 +21,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
- `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。
- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 29 passed。
- YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。
## 2026-04-30 | ADR-104 Playbook 版本化 lineage