fix(api): block external site k3s playbook mismatch
All checks were successful
CD Pipeline / tests (push) Successful in 1m22s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 3m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m30s

This commit is contained in:
Your Name
2026-06-01 17:28:32 +08:00
parent 1095425303
commit 8c5605fadf
5 changed files with 164 additions and 0 deletions

View File

@@ -809,6 +809,9 @@ rules:
alertname:
- MoWoooWorkDown
- MoWoooDevDown
- TsenyangWebsiteDown
- StockWoooWorkDown
- BitanWoooWorkDown
- ExternalSiteDown
- WebsiteDown
- BlackboxProbeFailed

View File

@@ -137,6 +137,16 @@ _AUTO_REPAIR_GATEWAY_PROJECT_ID = "awoooi"
_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS = 600
_SAFE_DOCKER_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$")
_UNSAFE_LEGACY_WRITE_PATTERN = re.compile(r"[;|<>`\n]|(\$\{|\$\()")
# Alert names raised by external website probes. This mirrors the ``alertname``
# list of the external-site alert rule, so every alert routed to that rule is
# also shielded from K3s node playbooks. ``MoWoooDevDown`` is part of that rule
# list and was previously missing here.
_EXTERNAL_SITE_ALERTNAMES = frozenset(
    {
        "MoWoooWorkDown",
        "MoWoooDevDown",
        "TsenyangWebsiteDown",
        "StockWoooWorkDown",
        "BitanWoooWorkDown",
        "ExternalSiteDown",
        "WebsiteDown",
        "BlackboxProbeFailed",
    }
)
# Alert names that identify a K3s node / VIP outage; playbooks declaring any of
# these in their symptom pattern are treated as K3s node repairs.
_K3S_NODE_ALERTNAMES = frozenset({"K3sNodeDown", "K3sVIPDown"})
# =============================================================================
@@ -396,6 +406,29 @@ class AutoRepairService:
max_risk = self._get_max_risk_level(best_match.playbook)
_is_cold_start = False
if (
self._is_external_site_incident(incident)
and self._playbook_is_k3s_node_repair(best_match.playbook)
):
logger.warning(
"auto_repair_blocked_external_site_k3s_playbook",
incident_id=incident.incident_id,
playbook_id=best_match.playbook.playbook_id,
alert_category=getattr(incident, "alert_category", None),
alert_names=symptoms.alert_names,
)
return AutoRepairDecision(
can_auto_repair=False,
playbook=best_match.playbook,
reason=(
"外部網站探測告警不得執行 K3s node PlayBook"
"需使用外部站台規則、Docker 站台修復或人工接手"
),
risk_level=max_risk,
blocked_by="EXTERNAL_SITE_K3S_PLAYBOOK",
similarity_score=best_match.similarity_score,
)
# 只保留: Playbook 必須是 APPROVED 狀態
if best_match.playbook.status != PlaybookStatus.APPROVED:
return AutoRepairDecision(
@@ -863,6 +896,21 @@ class AutoRepairService:
return True
return False
def _is_external_site_incident(self, incident: Incident) -> bool:
    """Tell whether the incident comes from an external website probe.

    External-site probe alerts must not be captured by K3s-node playbooks
    through fuzzy matching, so callers use this as a guard.

    An incident qualifies when its ``alert_category`` (case-insensitive) is
    ``external_site`` or ``service_404``, or when any of its signals carries
    a known external-site alert name or a ``layer`` label equal to
    ``external`` (case-insensitive).
    """
    raw_category = getattr(incident, "alert_category", None)
    if (raw_category or "").lower() in {"external_site", "service_404"}:
        return True

    def _signal_is_external(signal) -> bool:
        # The label value wins; the signal's own alert_name is the fallback.
        signal_labels = signal.labels or {}
        name = str(signal_labels.get("alertname") or signal.alert_name or "")
        probe_layer = str(signal_labels.get("layer") or "").lower()
        return name in _EXTERNAL_SITE_ALERTNAMES or probe_layer == "external"

    return any(_signal_is_external(sig) for sig in incident.signals or [])
def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
"""檢查 Playbook 是否包含 K8s 指令,避免主機告警誤執行 deployment 操作。"""
@@ -872,6 +920,23 @@ class AutoRepairService:
return True
return False
def _playbook_is_k3s_node_repair(self, playbook: Playbook) -> bool:
    """Tell whether the playbook is a K3s node repair.

    K3s node repair must only run for actual K3s node alerts. A playbook
    counts as one when its symptom pattern declares a K3s node alert name,
    or when any repair step's command contains ``kubectl get nodes`` or
    ``kubectl describe node`` (matched case-insensitively).
    """
    declared_alerts = set()
    for name in playbook.symptom_pattern.alert_names or []:
        if name:
            declared_alerts.add(str(name))
    if not declared_alerts.isdisjoint(_K3S_NODE_ALERTNAMES):
        return True

    node_markers = ("kubectl get nodes", "kubectl describe node")

    def _inspects_nodes(step) -> bool:
        normalized = (step.command or "").strip().lower()
        return any(marker in normalized for marker in node_markers)

    return any(_inspects_nodes(step) for step in playbook.repair_steps)
def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
"""非 K8s 修復或主機/備份事件驗證失敗時,禁止合成 K8s rollback。"""

View File

@@ -118,6 +118,30 @@ class TestRuleMatchingSpecificity:
assert result is not None
assert result["rule_id"] == "minio_disk_high"
def test_stock_site_down_matches_external_no_action_rule(self):
    """StockWoooWorkDown must resolve to the external_site_down NO_ACTION rule."""
    probe_labels = {
        "alertname": "StockWoooWorkDown",
        "instance": "http://stock.wooo.work",
        "layer": "external",
        "component": "stock-platform",
        "host": "110",
    }
    ctx = dict(
        alert_type="service_404",
        severity="critical",
        source="prometheus",
        target_resource="stock-platform",
        namespace="awoooi-prod",
        message="stock.wooo.work probe failed",
        labels=probe_labels,
    )

    matched = match_rule(ctx)

    assert matched is not None
    # The rule must be informational only: no action and no kubectl command.
    expected_fields = {
        "rule_id": "external_site_down",
        "suggested_action": "NO_ACTION",
        "kubectl_command": "",
    }
    for field, value in expected_fields.items():
        assert matched[field] == value
# =============================================================================
# 阻擋案例(應返回 False)

View File

@@ -348,6 +348,52 @@ class TestAutoRepairService:
assert decision.can_auto_repair is False
assert decision.blocked_by == "HOST_BACKUP_K8S_PLAYBOOK"
@pytest.mark.asyncio
async def test_external_site_blocks_k3s_node_playbook(self, service, mock_playbook_service):
    """External site probes must not fuzzy-match into K3s node repair.

    A high-similarity, APPROVED K3s node playbook is recommended for a
    StockWoooWorkDown incident; the decision must be blocked with
    ``EXTERNAL_SITE_K3S_PLAYBOOK`` while still reporting the playbook.
    """
    node_check_step = RepairStep(
        step_number=1,
        action_type=ActionType.KUBECTL,
        command="kubectl get nodes -o wide && kubectl describe node {target}",
        risk_level=RiskLevel.HIGH,
    )
    node_pattern = SymptomPattern(
        alert_names=["K3sNodeDown", "K3sVIPDown"],
        affected_services=[],
    )
    k3s_playbook = Playbook(
        playbook_id="PB-K3S-NODE",
        name="K3s 節點下線",
        description="K3s node repair",
        status=PlaybookStatus.APPROVED,
        symptom_pattern=node_pattern,
        repair_steps=[node_check_step],
        success_count=11,
        failure_count=2,
    )
    mock_playbook_service.add_playbook(k3s_playbook)
    mock_playbook_service.set_recommendations(
        [MockPlaybookRecommendation(k3s_playbook, similarity_score=0.91)]
    )

    incident = create_test_incident(
        severity=Severity.P2,
        alert_category="external_site",
        alert_name="StockWoooWorkDown",
    )
    incident.affected_services = ["stock-platform"]
    first_signal_labels = incident.signals[0].labels
    first_signal_labels.update(
        {
            "layer": "external",
            "component": "stock-platform",
            "host": "110",
        }
    )

    outcome = await service.evaluate_auto_repair(incident)

    assert outcome.can_auto_repair is False
    assert outcome.blocked_by == "EXTERNAL_SITE_K3S_PLAYBOOK"
    assert outcome.playbook == k3s_playbook
@pytest.mark.asyncio
async def test_backup_failure_allows_ssh_playbook(self, service, mock_playbook_service):
"""Backup/host incidents may still use SSH playbooks."""

View File

@@ -1,3 +1,29 @@
## 2026-06-01：StockWoooWorkDown 防止誤配 K3s node PlayBook
**背景**
- W-1 `auto_execute_success_rate` 的 7 日失敗中,除 Docker healthcheck legacy SSH restart 外,另有 4 筆 `StockWoooWorkDown` 被錯配到 `PB-20260416-79EB94`「K3s 節點下線」。
- 該 PlayBook 會執行 `kubectl get nodes ... && kubectl describe node {target}`,但 `stock-platform` 是外部網站/容器服務,不是 K3s node,因此 production 失敗為 `nodes "stock-platform" not found`。
**本次調整**
- `apps/api/alert_rules.yaml`:補上 `TsenyangWebsiteDown`、`StockWoooWorkDown`、`BitanWoooWorkDown` 到 `external_site_down`,讓這些外部站台 probe 告警明確走 `NO_ACTION` 規則。
- `apps/api/src/services/auto_repair_service.py`:新增外部站台告警防呆;若 RAG/fuzzy recommendation 把外部站台告警推薦到 K3s node 類 PlayBook,直接以 `EXTERNAL_SITE_K3S_PLAYBOOK` 阻擋。
- `apps/api/tests/test_auto_repair_service.py`:新增 `StockWoooWorkDown` 誤配 K3s node PlayBook 的防回歸測試。
- `apps/api/tests/test_alert_rule_engine_validation.py`:新增 `StockWoooWorkDown` 命中 `external_site_down -> NO_ACTION` 的防回歸測試。
**驗證**
- `python3 -m py_compile apps/api/src/services/auto_repair_service.py`
- `DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_auto_repair_service.py apps/api/tests/test_alert_rule_engine_validation.py -q` → `65 passed`
- `python3 scripts/security/security-mirror-progress-guard.py --root .` → `SECURITY_MIRROR_PROGRESS_GUARD_OK`
**進度邊界**
- 整體 AI 自動化飛輪進度仍維持 `61%`
- 這輪封住 `StockWoooWorkDown -> K3s node repair` 的未來失敗來源;不竄改舊 7 日 SLO 歷史。
- 若沒有新增失敗,`auto_execute_success_rate` 預估會在舊失敗滾出 7 日窗後自然回綠;若要更快回綠,需要真實新增成功 execution,而不是調整歷史資料。
## 2026-06-01：production auto_repair_executor MCP write grant 補套
**背景**