fix(aiops): block k8s playbooks for host repair
This commit is contained in:
@@ -337,6 +337,23 @@ class AutoRepairService:
|
||||
blocked_by="NOT_APPROVED",
|
||||
)
|
||||
|
||||
if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(best_match.playbook):
|
||||
logger.warning(
|
||||
"auto_repair_blocked_host_backup_k8s_playbook",
|
||||
incident_id=incident.incident_id,
|
||||
playbook_id=best_match.playbook.playbook_id,
|
||||
alert_category=getattr(incident, "alert_category", None),
|
||||
)
|
||||
return AutoRepairDecision(
|
||||
can_auto_repair=False,
|
||||
playbook=best_match.playbook,
|
||||
reason=(
|
||||
"主機/備份類告警禁止執行 K8s Playbook;"
|
||||
"需改走 SSH 診斷或緊急介入"
|
||||
),
|
||||
blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
|
||||
)
|
||||
|
||||
# 5. 可以自動修復
|
||||
logger.info(
|
||||
"auto_repair_approved",
|
||||
@@ -676,6 +693,29 @@ class AutoRepairService:
|
||||
|
||||
return max_risk
|
||||
|
||||
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
|
||||
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""
|
||||
|
||||
category = (getattr(incident, "alert_category", None) or "").lower()
|
||||
if category in {"host_resource", "backup_failure"}:
|
||||
return True
|
||||
|
||||
for signal in incident.signals or []:
|
||||
labels = signal.labels or {}
|
||||
alertname = str(labels.get("alertname") or signal.alert_name or "")
|
||||
if alertname.startswith("HostBackup") or alertname.startswith("Host"):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
|
||||
"""檢查 Playbook 是否包含 K8s 指令,避免主機告警誤執行 deployment 操作。"""
|
||||
|
||||
for step in playbook.repair_steps:
|
||||
command = (step.command or "").strip().lower()
|
||||
if step.action_type == ActionType.KUBECTL or command.startswith("kubectl "):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
|
||||
"""檢查風險是否超過自動修復門檻"""
|
||||
high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}
|
||||
|
||||
@@ -55,6 +55,8 @@ class MockPlaybookService:
|
||||
def create_test_incident(
|
||||
incident_id: str = "INC-TEST-001",
|
||||
severity: Severity = Severity.P2,
|
||||
alert_category: str | None = None,
|
||||
alert_name: str = "HighCPU",
|
||||
) -> Incident:
|
||||
"""Create a test incident"""
|
||||
now = now_taipei()
|
||||
@@ -63,13 +65,14 @@ def create_test_incident(
|
||||
status=IncidentStatus.INVESTIGATING,
|
||||
severity=severity,
|
||||
affected_services=["test-service"],
|
||||
alert_category=alert_category,
|
||||
signals=[
|
||||
Signal(
|
||||
alert_name="HighCPU",
|
||||
alert_name=alert_name,
|
||||
severity=severity,
|
||||
source="prometheus",
|
||||
fired_at=now,
|
||||
labels={"namespace": "prod"},
|
||||
labels={"namespace": "prod", "alertname": alert_name},
|
||||
),
|
||||
],
|
||||
)
|
||||
@@ -274,6 +277,64 @@ class TestAutoRepairService:
|
||||
assert decision.playbook.playbook_id == playbook.playbook_id
|
||||
assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
async def test_backup_failure_blocks_k8s_playbook(self, service, mock_playbook_service):
    """A backup-failure incident must be refused when a K8s playbook matches.

    NOTE(review): relies on create_high_quality_playbook() producing K8s
    steps; that helper is not visible here — confirm.
    """
    # Arrange: register the playbook and make it the only recommendation.
    k8s_playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
    mock_playbook_service.add_playbook(k8s_playbook)
    recommendation = MockPlaybookRecommendation(k8s_playbook, similarity_score=0.85)
    mock_playbook_service.set_recommendations([recommendation])

    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    # Act
    decision = await service.evaluate_auto_repair(backup_incident)

    # Assert: auto repair is refused by the host/backup guard specifically.
    assert decision.can_auto_repair is False
    assert decision.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"
|
||||
|
||||
@pytest.mark.asyncio
async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
    """A backup-failure incident is still auto-repairable with an SSH-only playbook."""
    # Arrange: an approved playbook whose single step is an SSH log read.
    backup_pattern = SymptomPattern(
        alert_names=["HostBackupFailed"],
        affected_services=["test-service"],
    )
    collect_logs_step = RepairStep(
        step_number=1,
        action_type=ActionType.SSH_COMMAND,
        command="ssh {host} 'tail -80 /var/log/backup.log'",
        risk_level=RiskLevel.LOW,
        description="collect backup logs",
    )
    ssh_playbook = Playbook(
        playbook_id="PB-BACKUP-SSH",
        name="Backup SSH diagnostics",
        description="Read-only backup diagnosis",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=backup_pattern,
        repair_steps=[collect_logs_step],
        success_count=20,
        failure_count=1,
    )
    mock_playbook_service.add_playbook(ssh_playbook)
    recommendation = MockPlaybookRecommendation(ssh_playbook, similarity_score=0.85)
    mock_playbook_service.set_recommendations([recommendation])

    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    # Act
    decision = await service.evaluate_auto_repair(backup_incident)

    # Assert: nothing blocks the SSH playbook.
    assert decision.can_auto_repair is True
    assert decision.blocked_by is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
|
||||
"""Test that LOW risk actions are allowed"""
|
||||
|
||||
@@ -12,12 +12,14 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
|
||||
|
||||
### 完成
|
||||
- `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。
|
||||
- `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。
|
||||
- `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。
|
||||
- 補 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。
|
||||
|
||||
### 驗證
|
||||
- `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。
|
||||
- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。
|
||||
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。
|
||||
- YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。
|
||||
|
||||
## 2026-04-30 | ADR-104 Playbook 版本化 lineage
|
||||
|
||||
Reference in New Issue
Block a user