fix(api): block external site k3s playbook mismatch
This commit is contained in:
@@ -809,6 +809,9 @@ rules:
|
||||
alertname:
|
||||
- MoWoooWorkDown
|
||||
- MoWoooDevDown
|
||||
- TsenyangWebsiteDown
|
||||
- StockWoooWorkDown
|
||||
- BitanWoooWorkDown
|
||||
- ExternalSiteDown
|
||||
- WebsiteDown
|
||||
- BlackboxProbeFailed
|
||||
|
||||
@@ -137,6 +137,16 @@ _AUTO_REPAIR_GATEWAY_PROJECT_ID = "awoooi"
|
||||
_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS = 600
|
||||
_SAFE_DOCKER_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$")
|
||||
_UNSAFE_LEGACY_WRITE_PATTERN = re.compile(r"[;|<>`\n]|(\$\{|\$\()")
|
||||
# Alert names raised by external website probes. An incident carrying any of
# these is treated as an external-site incident by
# `_is_external_site_incident`.
# NOTE(review): this mirrors the `alertname` list of the external-site rule in
# alert_rules.yaml; `MoWoooDevDown` is listed there, so it is included here to
# keep both lists in agreement. Keep the two in sync when adding new probes.
_EXTERNAL_SITE_ALERTNAMES = {
    "MoWoooWorkDown",
    "MoWoooDevDown",
    "TsenyangWebsiteDown",
    "StockWoooWorkDown",
    "BitanWoooWorkDown",
    "ExternalSiteDown",
    "WebsiteDown",
    "BlackboxProbeFailed",
}
# Alert names that mark a playbook's symptom pattern as K3s node repair
# (checked by `_playbook_is_k3s_node_repair`).
_K3S_NODE_ALERTNAMES = {"K3sNodeDown", "K3sVIPDown"}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -396,6 +406,29 @@ class AutoRepairService:
|
||||
max_risk = self._get_max_risk_level(best_match.playbook)
|
||||
_is_cold_start = False
|
||||
|
||||
if (
|
||||
self._is_external_site_incident(incident)
|
||||
and self._playbook_is_k3s_node_repair(best_match.playbook)
|
||||
):
|
||||
logger.warning(
|
||||
"auto_repair_blocked_external_site_k3s_playbook",
|
||||
incident_id=incident.incident_id,
|
||||
playbook_id=best_match.playbook.playbook_id,
|
||||
alert_category=getattr(incident, "alert_category", None),
|
||||
alert_names=symptoms.alert_names,
|
||||
)
|
||||
return AutoRepairDecision(
|
||||
can_auto_repair=False,
|
||||
playbook=best_match.playbook,
|
||||
reason=(
|
||||
"外部網站探測告警不得執行 K3s node PlayBook;"
|
||||
"需使用外部站台規則、Docker 站台修復或人工接手"
|
||||
),
|
||||
risk_level=max_risk,
|
||||
blocked_by="EXTERNAL_SITE_K3S_PLAYBOOK",
|
||||
similarity_score=best_match.similarity_score,
|
||||
)
|
||||
|
||||
# 只保留: Playbook 必須是 APPROVED 狀態
|
||||
if best_match.playbook.status != PlaybookStatus.APPROVED:
|
||||
return AutoRepairDecision(
|
||||
@@ -863,6 +896,21 @@ class AutoRepairService:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_external_site_incident(self, incident: Incident) -> bool:
    """Report whether the incident originates from an external-site probe.

    External website probe alerts must not be captured by a fuzzy match to a
    K3s-node style playbook, so callers use this predicate as a guard.

    The incident qualifies when its ``alert_category`` (case-insensitive) is
    ``external_site`` or ``service_404``, or when any signal either carries
    an alert name listed in ``_EXTERNAL_SITE_ALERTNAMES`` or has the label
    ``layer`` equal to ``external`` (case-insensitive).
    """
    raw_category = getattr(incident, "alert_category", None)
    if (raw_category or "").lower() in {"external_site", "service_404"}:
        return True

    def _signal_is_external(sig) -> bool:
        tags = sig.labels or {}
        # The label wins; the signal's own alert_name is only a fallback.
        name = str(tags.get("alertname") or sig.alert_name or "")
        tier = str(tags.get("layer") or "").lower()
        return name in _EXTERNAL_SITE_ALERTNAMES or tier == "external"

    return any(_signal_is_external(sig) for sig in (incident.signals or []))
|
||||
|
||||
def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
|
||||
"""檢查 Playbook 是否包含 K8s 指令,避免主機告警誤執行 deployment 操作。"""
|
||||
|
||||
@@ -872,6 +920,23 @@ class AutoRepairService:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _playbook_is_k3s_node_repair(self, playbook: Playbook) -> bool:
    """Return True when the playbook is a K3s node repair playbook.

    K3s node repair must only run for actual K3s node alerts. A playbook is
    classified as node repair when either:

    * its symptom pattern lists an alert name in ``_K3S_NODE_ALERTNAMES``, or
    * any repair step inspects cluster nodes through ``kubectl get node(s)``
      or ``kubectl describe node(s)``.

    Args:
        playbook: The candidate playbook to classify.

    Returns:
        True if the playbook targets K3s nodes, otherwise False.
    """
    pattern_alerts = {
        str(alert)
        for alert in (playbook.symptom_pattern.alert_names or [])
        if alert
    }
    if pattern_alerts & _K3S_NODE_ALERTNAMES:
        return True

    # Guard against a missing step list the same way alert_names is guarded.
    for step in playbook.repair_steps or []:
        # Collapse whitespace runs so spacing differences such as
        # "kubectl  get   nodes" cannot slip past the substring checks.
        command = " ".join((step.command or "").lower().split())
        # "get node" also covers "get nodes"; kubectl accepts both forms,
        # matching how "describe node" already covers "describe nodes".
        if "kubectl get node" in command or "kubectl describe node" in command:
            return True
    return False
|
||||
|
||||
def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
|
||||
"""非 K8s 修復或主機/備份事件驗證失敗時,禁止合成 K8s rollback。"""
|
||||
|
||||
|
||||
@@ -118,6 +118,30 @@ class TestRuleMatchingSpecificity:
|
||||
assert result is not None
|
||||
assert result["rule_id"] == "minio_disk_high"
|
||||
|
||||
def test_stock_site_down_matches_external_no_action_rule(self):
    """A StockWoooWorkDown probe alert must hit external_site_down / NO_ACTION."""
    probe_labels = {
        "alertname": "StockWoooWorkDown",
        "instance": "http://stock.wooo.work",
        "layer": "external",
        "component": "stock-platform",
        "host": "110",
    }
    ctx = {
        "alert_type": "service_404",
        "severity": "critical",
        "source": "prometheus",
        "target_resource": "stock-platform",
        "namespace": "awoooi-prod",
        "message": "stock.wooo.work probe failed",
        "labels": probe_labels,
    }

    matched = match_rule(ctx)

    assert matched is not None
    # The external-site rule must not suggest any kubectl action.
    expected_fields = {
        "rule_id": "external_site_down",
        "suggested_action": "NO_ACTION",
        "kubectl_command": "",
    }
    for field, expected in expected_fields.items():
        assert matched[field] == expected
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 阻擋案例(應返回 False)
|
||||
|
||||
@@ -348,6 +348,52 @@ class TestAutoRepairService:
|
||||
assert decision.can_auto_repair is False
|
||||
assert decision.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"
|
||||
|
||||
@pytest.mark.asyncio
async def test_external_site_blocks_k3s_node_playbook(self, service, mock_playbook_service):
    """External site probes must not fuzzy-match into K3s node repair."""
    # An approved playbook that inspects K3s nodes via kubectl.
    node_inspection_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command="kubectl get nodes -o wide && kubectl describe node {target}",
        risk_level=RiskLevel.HIGH,
    )
    k3s_pattern = SymptomPattern(
        alert_names=["K3sNodeDown", "K3sVIPDown"],
        affected_services=[],
    )
    k3s_playbook = Playbook(
        playbook_id="PB-K3S-NODE",
        name="K3s 節點下線",
        description="K3s node repair",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=k3s_pattern,
        repair_steps=[node_inspection_step],
        success_count=11,
        failure_count=2,
    )
    mock_playbook_service.add_playbook(k3s_playbook)
    # The recommender returns the K3s playbook with a high similarity score.
    mock_playbook_service.set_recommendations(
        [MockPlaybookRecommendation(k3s_playbook, similarity_score=0.91)]
    )

    # An external-site incident for the stock platform.
    external_labels = {
        "layer": "external",
        "component": "stock-platform",
        "host": "110",
    }
    incident = create_test_incident(
        severity=Severity.P2,
        alert_category="external_site",
        alert_name="StockWoooWorkDown",
    )
    incident.affected_services = ["stock-platform"]
    incident.signals[0].labels.update(external_labels)

    decision = await service.evaluate_auto_repair(incident)

    assert decision.can_auto_repair is False
    assert decision.blocked_by == "EXTERNAL_SITE_K3S_PLAYBOOK"
    assert decision.playbook == k3s_playbook
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
|
||||
"""Backup/host incidents may still use SSH playbooks."""
|
||||
|
||||
@@ -1,3 +1,29 @@
|
||||
## 2026-06-01|StockWoooWorkDown 防止誤配 K3s node PlayBook
|
||||
|
||||
**背景**:
|
||||
|
||||
- W-1 `auto_execute_success_rate` 的 7 日失敗中,除 Docker healthcheck legacy SSH restart 外,另有 4 筆 `StockWoooWorkDown` 被錯配到 `PB-20260416-79EB94`「K3s 節點下線」。
|
||||
- 該 PlayBook 會執行 `kubectl get nodes ... && kubectl describe node {target}`,但 `stock-platform` 是外部網站/容器服務,不是 K3s node,因此 production 失敗為 `nodes "stock-platform" not found`。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- `apps/api/alert_rules.yaml`:補上 `TsenyangWebsiteDown`、`StockWoooWorkDown`、`BitanWoooWorkDown` 到 `external_site_down`,讓這些外部站台 probe 告警明確走 `NO_ACTION` 規則。
|
||||
- `apps/api/src/services/auto_repair_service.py`:新增外部站台告警防呆;若 RAG/fuzzy recommendation 把外部站台告警推薦到 K3s node 類 PlayBook,直接以 `EXTERNAL_SITE_K3S_PLAYBOOK` 阻擋。
|
||||
- `apps/api/tests/test_auto_repair_service.py`:新增 `StockWoooWorkDown` 誤配 K3s node PlayBook 的防回歸測試。
|
||||
- `apps/api/tests/test_alert_rule_engine_validation.py`:新增 `StockWoooWorkDown` 命中 `external_site_down -> NO_ACTION` 的防回歸測試。
|
||||
|
||||
**驗證**:
|
||||
|
||||
- `python3 -m py_compile apps/api/src/services/auto_repair_service.py`
|
||||
- `DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_auto_repair_service.py apps/api/tests/test_alert_rule_engine_validation.py -q` → `65 passed`
|
||||
- `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK`
|
||||
|
||||
**進度邊界**:
|
||||
|
||||
- 整體 AI 自動化飛輪進度仍維持 `61%`。
|
||||
- 這輪封住 `StockWoooWorkDown -> K3s node repair` 的未來失敗來源;不竄改舊 7 日 SLO 歷史。
|
||||
- 若沒有新增失敗,`auto_execute_success_rate` 預估會在舊失敗滾出 7 日窗後自然回綠;若要更快回綠,需要真實新增成功 execution,而不是調整歷史資料。
|
||||
|
||||
## 2026-06-01|production auto_repair_executor MCP write grant 補套
|
||||
|
||||
**背景**:
|
||||
|
||||
Reference in New Issue
Block a user