fix(aiops): block k8s playbooks for host repair
All checks were successful
CD Pipeline / tests (push) Successful in 1m27s
Code Review / ai-code-review (push) Successful in 26s
CD Pipeline / build-and-deploy (push) Successful in 8m6s
CD Pipeline / post-deploy-checks (push) Successful in 3m31s

This commit is contained in:
Your Name
2026-05-01 10:33:28 +08:00
parent 7472eb2fcd
commit e4aef6ac4e
3 changed files with 105 additions and 2 deletions

View File

@@ -337,6 +337,23 @@ class AutoRepairService:
blocked_by="NOT_APPROVED",
)
if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(best_match.playbook):
logger.warning(
"auto_repair_blocked_host_backup_k8s_playbook",
incident_id=incident.incident_id,
playbook_id=best_match.playbook.playbook_id,
alert_category=getattr(incident, "alert_category", None),
)
return AutoRepairDecision(
can_auto_repair=False,
playbook=best_match.playbook,
reason=(
"主機/備份類告警禁止執行 K8s Playbook"
"需改走 SSH 診斷或緊急介入"
),
blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
)
# 5. 可以自動修復
logger.info(
"auto_repair_approved",
@@ -676,6 +693,29 @@ class AutoRepairService:
return max_risk
def _is_host_or_backup_incident(self, incident: Incident) -> bool:
"""主機/備份類事件只能走 SSH/只讀診斷,不允許 K8s rollout 類修復。"""
category = (getattr(incident, "alert_category", None) or "").lower()
if category in {"host_resource", "backup_failure"}:
return True
for signal in incident.signals or []:
labels = signal.labels or {}
alertname = str(labels.get("alertname") or signal.alert_name or "")
if alertname.startswith("HostBackup") or alertname.startswith("Host"):
return True
return False
def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
"""檢查 Playbook 是否包含 K8s 指令,避免主機告警誤執行 deployment 操作。"""
for step in playbook.repair_steps:
command = (step.command or "").strip().lower()
if step.action_type == ActionType.KUBECTL or command.startswith("kubectl "):
return True
return False
def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
"""檢查風險是否超過自動修復門檻"""
high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}

View File

@@ -55,6 +55,8 @@ class MockPlaybookService:
def create_test_incident(
incident_id: str = "INC-TEST-001",
severity: Severity = Severity.P2,
alert_category: str | None = None,
alert_name: str = "HighCPU",
) -> Incident:
"""Create a test incident"""
now = now_taipei()
@@ -63,13 +65,14 @@ def create_test_incident(
status=IncidentStatus.INVESTIGATING,
severity=severity,
affected_services=["test-service"],
alert_category=alert_category,
signals=[
Signal(
alert_name="HighCPU",
alert_name=alert_name,
severity=severity,
source="prometheus",
fired_at=now,
labels={"namespace": "prod"},
labels={"namespace": "prod", "alertname": alert_name},
),
],
)
@@ -274,6 +277,64 @@ class TestAutoRepairService:
assert decision.playbook.playbook_id == playbook.playbook_id
assert decision.blocked_by is None
@pytest.mark.asyncio
async def test_backup_failure_blocks_k8s_playbook(self, service, mock_playbook_service):
    """A backup_failure incident must be refused when a K8s playbook matches.

    The recommended playbook comes from ``create_high_quality_playbook``
    (presumably containing kubectl steps -- confirm against that helper);
    the service is expected to block with ``HOST_BACKUP_K8S_PLAYBOOK``.
    """
    k8s_playbook = create_high_quality_playbook(risk_level=RiskLevel.MEDIUM)
    mock_playbook_service.add_playbook(k8s_playbook)
    recommendations = [MockPlaybookRecommendation(k8s_playbook, similarity_score=0.85)]
    mock_playbook_service.set_recommendations(recommendations)

    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    verdict = await service.evaluate_auto_repair(backup_incident)

    assert verdict.can_auto_repair is False
    assert verdict.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"
@pytest.mark.asyncio
async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
    """A backup_failure incident may still be auto-repaired by an SSH-only playbook.

    Builds an approved playbook whose single step is an SSH command, and
    expects the decision to allow auto repair with no blocker set.
    """
    # The only repair step: an SSH command tailing the backup log.
    collect_logs_step = RepairStep(
        step_number=1,
        action_type=ActionType.SSH_COMMAND,
        command="ssh {host} 'tail -80 /var/log/backup.log'",
        risk_level=RiskLevel.LOW,
        description="collect backup logs",
    )
    backup_symptoms = SymptomPattern(
        alert_names=["HostBackupFailed"],
        affected_services=["test-service"],
    )
    ssh_playbook = Playbook(
        playbook_id="PB-BACKUP-SSH",
        name="Backup SSH diagnostics",
        description="Read-only backup diagnosis",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=backup_symptoms,
        repair_steps=[collect_logs_step],
        success_count=20,
        failure_count=1,
    )
    mock_playbook_service.add_playbook(ssh_playbook)
    recommendations = [MockPlaybookRecommendation(ssh_playbook, similarity_score=0.85)]
    mock_playbook_service.set_recommendations(recommendations)

    backup_incident = create_test_incident(
        severity=Severity.P2,
        alert_category="backup_failure",
        alert_name="HostBackupFailed",
    )

    verdict = await service.evaluate_auto_repair(backup_incident)

    assert verdict.can_auto_repair is True
    assert verdict.blocked_by is None
@pytest.mark.asyncio
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
"""Test that LOW risk actions are allowed"""

View File

@@ -12,12 +12,14 @@ Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會
### 完成
- `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。
- `AutoRepairService` 追加 host/backup Playbook guard:主機/備份 incident 若匹配到 K8s rollout 類 Playbook,則阻擋為 `HOST_BACKUP_K8S_PLAYBOOK`,改走緊急介入。
- `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。
- 新增 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。
### 驗證
- `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。
- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。
- `cd apps/api && pytest tests/test_auto_repair_service.py tests/test_alertmanager_rule_bypass.py -q` → 27 passed。
- YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。
## 2026-04-30 | ADR-104 Playbook 版本化 lineage