diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 23774b49..81e6c746 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -319,6 +319,7 @@ async def _try_auto_repair_background( # auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線 # 人工審核路徑已在 approval_execution._run_post_execution_verify 接線, # 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3) + _post_verify_result: str | None = None if result: from src.core.feature_flags import aiops_flags if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): @@ -345,6 +346,7 @@ async def _try_auto_repair_background( ), timeout=60.0, ) + _post_verify_result = _verify_result logger.info( "auto_repair_post_verify_complete", incident_id=incident_id, @@ -376,6 +378,30 @@ async def _try_auto_repair_background( # ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換 # 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成 if result and result.success: + if _post_verify_result in ("failed", "degraded", "timeout"): + logger.warning( + "incident_resolve_after_auto_repair_skipped_verification", + incident_id=incident_id, + approval_id=approval_id, + verification_result=_post_verify_result, + playbook_id=result.playbook_id, + ) + await _escalate_auto_repair_unavailable( + incident_id=incident_id, + approval_id=approval_id, + alert_type=alert_type, + target_resource=target_resource, + namespace=namespace, + failure_reason=( + f"auto repair execution succeeded but post verification " + f"returned {_post_verify_result}; incident remains open" + ), + attempted_actions=( + f"execute_auto_repair playbook:{result.playbook_id} " + f"-> verifier:{_post_verify_result} -> emergency_intervention" + ), + ) + return try: _inc_svc_resolve = get_incident_service() await _inc_svc_resolve.resolve_incident(incident_id) diff --git a/apps/api/src/repositories/alert_operation_log_repository.py b/apps/api/src/repositories/alert_operation_log_repository.py index 540be94f..2c6e5db5 100644 --- a/apps/api/src/repositories/alert_operation_log_repository.py +++ b/apps/api/src/repositories/alert_operation_log_repository.py @@ -43,6 +43,7 @@ ALERT_EVENT_TYPES = { "BACKUP_COMPLETED", "BACKUP_FAILED", "APPROVAL_ESCALATED", + "EMERGENCY_ESCALATED", "CHANGE_APPLIED", # ADR-071 通知生命週期 (2026-04-11 Claude Sonnet 4.6 Asia/Taipei) "NOTIFICATION_CLASSIFIED", diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index 92115201..fb44ecbd 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -521,6 +521,13 @@ class AutoRepairService: # 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合 # PostExecutionVerifier 判斷失敗/降級 → 觸發自動 Rollback if verification_result in ("failed", "degraded"): + if self._should_escalate_failed_verification(incident, playbook): + await self._escalate_failed_verification( + incident=incident, + playbook=playbook, + verification_result=verification_result, + ) + return try: from src.services.rollback_manager import get_rollback_manager from src.services.declarative_remediation import DeclarativeRemediation @@ -716,6 +723,69 @@ class AutoRepairService: return True return False + def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool: + """非 K8s 修復或主機/備份事件驗證失敗時,禁止合成 K8s rollback。""" + + return self._is_host_or_backup_incident(incident) or not self._playbook_has_k8s_steps(playbook) + + async def _escalate_failed_verification( + self, + *, + incident: Incident, + playbook: Playbook, + verification_result: str, + ) -> None: + """Post-verification failed but rollback is unsafe; notify emergency channel.""" + + target = (incident.affected_services or ["unknown"])[0] + namespace = "awoooi-prod" + alert_type = self._incident_alert_type(incident) + reason = ( + f"auto repair playbook {playbook.playbook_id} verification={verification_result}; " + "rollback is unsafe for host/backup or non-K8s remediation" + ) + logger.warning( + "auto_repair_verification_failed_emergency", + incident_id=incident.incident_id, + playbook_id=playbook.playbook_id, + verification_result=verification_result, + target=target, + ) + try: + from src.services.emergency_escalation_service import ( + escalate_auto_repair_unavailable, + ) + + await escalate_auto_repair_unavailable( + incident_id=incident.incident_id, + approval_id=None, + alert_type=alert_type, + target_resource=target, + namespace=namespace, + failure_reason=reason, + attempted_actions=( + f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} " + "-> emergency_intervention" + ), + ) + except Exception as exc: + logger.warning( + "auto_repair_verification_emergency_failed", + incident_id=incident.incident_id, + playbook_id=playbook.playbook_id, + error=str(exc), + ) + + def _incident_alert_type(self, incident: Incident) -> str: + """Best-effort alertname for emergency cards.""" + + for signal in incident.signals or []: + labels = signal.labels or {} + alertname = labels.get("alertname") or signal.alert_name + if alertname: + return str(alertname) + return "AutoRepairVerificationFailed" + def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool: """檢查風險是否超過自動修復門檻""" high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL} diff --git a/apps/api/src/services/emergency_escalation_service.py b/apps/api/src/services/emergency_escalation_service.py index 21b17a22..40a1f134 100644 --- a/apps/api/src/services/emergency_escalation_service.py +++ b/apps/api/src/services/emergency_escalation_service.py @@ -19,7 +19,7 @@ logger = structlog.get_logger(__name__) async def escalate_auto_repair_unavailable( *, incident_id: str, - approval_id: str, + approval_id: str | None, alert_type: str, target_resource: str, namespace: str, diff --git a/apps/api/tests/test_auto_repair_service.py b/apps/api/tests/test_auto_repair_service.py index b2eae3eb..c12561c0 100644 --- a/apps/api/tests/test_auto_repair_service.py +++ b/apps/api/tests/test_auto_repair_service.py @@ -335,6 +335,43 @@ class TestAutoRepairService: assert decision.can_auto_repair is True assert decision.blocked_by is None + def test_failed_verification_escalates_for_host_backup_ssh_playbook(self, service): + """Failed backup SSH diagnostics must not synthesize K8s rollback.""" + playbook = Playbook( + playbook_id="PB-BACKUP-SSH", + name="Backup SSH diagnostics", + description="Read-only backup diagnosis", + status=PlaybookStatus.APPROVED, + symptom_pattern=SymptomPattern( + alert_names=["HostBackupFailed"], + affected_services=["test-service"], + ), + repair_steps=[ + RepairStep( + step_number=1, + action_type=ActionType.SSH_COMMAND, + command="ssh {host} 'tail -80 /var/log/backup.log'", + risk_level=RiskLevel.LOW, + ), + ], + success_count=20, + failure_count=1, + ) + incident = create_test_incident( + severity=Severity.P2, + alert_category="backup_failure", + alert_name="HostBackupFailed", + ) + + assert service._should_escalate_failed_verification(incident, playbook) is True + + def test_failed_verification_allows_k8s_rollback_for_k8s_playbook(self, service): + """K8s playbooks may still use the existing K8s rollback path.""" + playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM) + incident = create_test_incident(severity=Severity.P2) + + assert service._should_escalate_failed_verification(incident, playbook) is False + @pytest.mark.asyncio async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service): """Test that LOW risk actions are allowed""" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e149f75d..388fefaa 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -13,6 +13,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會 ### 完成 - `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。 - `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。 +- `AutoRepairService` post-verification rollback guard:host/backup 或非 K8s Playbook 驗證失敗時,不再合成 `kubectl rollout restart deployment/{target}`,改走 emergency escalation,且不自動 resolve incident。 - `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。 - 補 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。 @@ -20,6 +21,7 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會 - `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。 - `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。 - `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。 +- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 29 passed。 - YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。 ## 2026-04-30 | ADR-104 Playbook 版本化 lineage