From e4aef6ac4e83c877735e21ee05376f7fca6b8b91 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Fri, 1 May 2026 10:33:28 +0800
Subject: [PATCH] fix(aiops): block k8s playbooks for host repair

---
 apps/api/src/services/auto_repair_service.py | 40 ++++++++++++
 apps/api/tests/test_auto_repair_service.py   | 65 +++++++++++++++++++-
 docs/LOGBOOK.md                              |  2 +
 3 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py
index 7bf1d86a..92115201 100644
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -337,6 +337,23 @@ class AutoRepairService:
                 blocked_by="NOT_APPROVED",
             )
 
+        if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(best_match.playbook):
+            logger.warning(
+                "auto_repair_blocked_host_backup_k8s_playbook",
+                incident_id=incident.incident_id,
+                playbook_id=best_match.playbook.playbook_id,
+                alert_category=getattr(incident, "alert_category", None),
+            )
+            return AutoRepairDecision(
+                can_auto_repair=False,
+                playbook=best_match.playbook,
+                reason=(
+                    "主機/備份類告警禁止執行 K8s Playbook;"
+                    "需改走 SSH 診斷或緊急介入"
+                ),
+                blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
+            )
+
         # 5. 可以自動修復
         logger.info(
             "auto_repair_approved",
@@ -676,6 +693,29 @@ class AutoRepairService:
 
         return max_risk
 
+    def _is_host_or_backup_incident(self, incident: Incident) -> bool:
+        """Return True for host/backup incidents (SSH/read-only diagnosis only, no K8s rollout repair)."""
+
+        category = (getattr(incident, "alert_category", None) or "").lower()
+        if category in {"host_resource", "backup_failure"}:
+            return True
+
+        for signal in incident.signals or []:
+            labels = signal.labels or {}
+            alertname = str(labels.get("alertname") or signal.alert_name or "")
+            if alertname.startswith("Host"):  # the "Host" prefix also covers "HostBackup*"
+                return True
+        return False
+
+    def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
+        """Return True if any repair step runs kubectl, so host alerts never trigger deployment operations."""
+
+        for step in playbook.repair_steps or []:  # guard against a missing (None) step list
+            command = (step.command or "").strip().lower()
+            if step.action_type == ActionType.KUBECTL or command.split(maxsplit=1)[:1] == ["kubectl"]:
+                return True
+        return False
+
     def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
         """檢查風險是否超過自動修復門檻"""
         high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}
diff --git a/apps/api/tests/test_auto_repair_service.py b/apps/api/tests/test_auto_repair_service.py
index 971ce35e..b2eae3eb 100644
--- a/apps/api/tests/test_auto_repair_service.py
+++ b/apps/api/tests/test_auto_repair_service.py
@@ -55,6 +55,8 @@ class MockPlaybookService:
 def create_test_incident(
     incident_id: str = "INC-TEST-001",
     severity: Severity = Severity.P2,
+    alert_category: str | None = None,
+    alert_name: str = "HighCPU",
 ) -> Incident:
     """Create a test incident"""
     now = now_taipei()
@@ -63,13 +65,14 @@ def create_test_incident(
         status=IncidentStatus.INVESTIGATING,
         severity=severity,
         affected_services=["test-service"],
+        alert_category=alert_category,
         signals=[
             Signal(
-                alert_name="HighCPU",
+                alert_name=alert_name,
                 severity=severity,
                 source="prometheus",
                 fired_at=now,
-                labels={"namespace": "prod"},
+                labels={"namespace": "prod", "alertname": alert_name},
             ),
         ],
     )
@@ -274,6 +277,64 @@ class TestAutoRepairService:
         assert decision.playbook.playbook_id == playbook.playbook_id
         assert decision.blocked_by is None
 
+    @pytest.mark.asyncio
+    async def test_backup_failure_blocks_k8s_playbook(self, service, mock_playbook_service):
+        """Backup/host incidents must not execute K8s rollout playbooks."""
+        playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
+        mock_playbook_service.add_playbook(playbook)
+        mock_playbook_service.set_recommendations([
+            MockPlaybookRecommendation(playbook, similarity_score=0.85)
+        ])
+
+        incident = create_test_incident(
+            severity=Severity.P2,
+            alert_category="backup_failure",
+            alert_name="HostBackupFailed",
+        )
+        decision = await service.evaluate_auto_repair(incident)
+
+        assert decision.can_auto_repair is False
+        assert decision.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"
+
+    @pytest.mark.asyncio
+    async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
+        """Backup/host incidents may still use SSH playbooks."""
+        playbook = Playbook(
+            playbook_id="PB-BACKUP-SSH",
+            name="Backup SSH diagnostics",
+            description="Read-only backup diagnosis",
+            status=PlaybookStatus.APPROVED,
+            symptom_pattern=SymptomPattern(
+                alert_names=["HostBackupFailed"],
+                affected_services=["test-service"],
+            ),
+            repair_steps=[
+                RepairStep(
+                    step_number=1,
+                    action_type=ActionType.SSH_COMMAND,
+                    command="ssh {host} 'tail -80 /var/log/backup.log'",
+                    risk_level=RiskLevel.LOW,
+                    description="collect backup logs",
+                ),
+            ],
+            success_count=20,
+            failure_count=1,
+        )
+        mock_playbook_service.add_playbook(playbook)
+        mock_playbook_service.set_recommendations([
+            MockPlaybookRecommendation(playbook, similarity_score=0.85)
+        ])
+
+        incident = create_test_incident(
+            severity=Severity.P2,
+            alert_category="backup_failure",
+            alert_name="HostBackupFailed",
+        )
+        decision = await service.evaluate_auto_repair(incident)
+
+        assert decision.can_auto_repair is True
+        assert decision.blocked_by is None
+
     @pytest.mark.asyncio
     async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
         """Test that LOW risk actions are allowed"""
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index e71e30ea..e149f75d 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -12,12 +12,14 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
 
 ### 完成
 - `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。
+- `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。
 - `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。
 - 補 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。
 
 ### 驗證
 - `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。
 - `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。
+- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。
 - YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。
 
 ## 2026-04-30 | ADR-104 Playbook 版本化 lineage