feat(wave2+km): LLM 動態按鈕啟用 + KM 自動寫入 + AI Router dead code 標記
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m52s

- ConfigMap: USE_LLM_DYNAMIC_BUTTONS=true(B2/B3/B4 handler 全就緒)
- decision_manager: auto_execute 失敗路徑補 KM fire-and-forget 寫入
- ai_router: _build_fallback_chain 標記 DEPRECATED 2026-04-28
- tests: test_golden_regression.py 新增 172 行 golden 回歸測試

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-28 15:27:26 +08:00
parent 2e6ae7fe84
commit 143c15f052
4 changed files with 193 additions and 0 deletions

View File

@@ -599,6 +599,7 @@ class AIRouter:
self, selected_provider: AIProviderEnum
) -> list[tuple[AIProviderEnum, str]]:
"""
DEPRECATED (2026-04-28): superseded by _build_fallback_chain_for_intent; no callers remain.
建立 Fallback 鏈 (排除已選 Provider)
Fallback 順序: Ollama → Gemini → Claude

View File

@@ -2098,6 +2098,11 @@ class DecisionManager:
except Exception as _cd_err:
logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err))
# P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: sentinels for the failure-path KM write.
# Declared outside the try so the except block can read them (even if construction fails).
_km_executor = None
_km_approval = None
try:
# 延遲導入避免循環依賴
from src.models.approval import ApprovalRequest, ApprovalStatus
@@ -2119,6 +2124,7 @@ class DecisionManager:
risk_level=_risk,
matched_playbook_id=_matched_playbook_id,
)
_km_approval = approval # P1.1: 供 except 失敗路徑使用
# ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6)
_metrics_before = await _fetch_metrics_snapshot(incident)
@@ -2144,6 +2150,7 @@ class DecisionManager:
# _push_auto_repair_result → Telegram 顯示 ✅ 自動修復完成,即使 K8s 拒絕了指令
# 修復: execute_approved_action 現在返回 bool正確透傳給通知函數
executor = ApprovalExecutionService()
_km_executor = executor # P1.1: 供 except 失敗路徑使用
_exec_success = await executor.execute_approved_action(approval)
# 更新狀態
@@ -2190,6 +2197,16 @@ class DecisionManager:
_push_decision_to_telegram(incident, token.proposal_data)
)
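# P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: add the KM write on the failure path.
# Root cause: when auto_execute raises, the learning flywheel receives no failure record at all.
# Fix: if executor/approval were already created, fire-and-forget a failure KM write; otherwise skip.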
if _km_executor is not None and _km_approval is not None:
_fire_and_forget(
_km_executor._write_execution_result_to_km(
_km_approval, False, str(e)
)
)
async def _query_kb_context_inner(self, incident: Incident) -> str:
"""KB RAG 實際查詢邏輯,由 _query_kb_context 包裝 timeout 後呼叫"""
query_parts = list(incident.affected_services)

View File

@@ -0,0 +1,172 @@
# apps/api/tests/test_golden_regression.py
# 2026-04-28 ogt + Claude Sonnet 4.6 — P3.2.4 黃金回歸測試集
"""
33 golden test cases that guard classify_alert_early() against regressions.
They are grouped into 11 test classes by alert scenario.
Function signature:
classify_alert_early(alertname, severity, labels=None, age_hours=0.0)
-> tuple[str, str] # (alert_category, notification_type)
"""
from __future__ import annotations
import pytest
from src.services.incident_service import classify_alert_early
class TestKubernetes:
    """Golden cases whose expected category is ``kubernetes``."""

    def test_pod_crash_looping(self):
        """PodCrashLooping / critical must yield ("kubernetes", "TYPE-3")."""
        category, kind = classify_alert_early("PodCrashLooping", "critical")
        assert (category, kind) == ("kubernetes", "TYPE-3")

    def test_deploy_rollout_stuck(self):
        """DeployRolloutStuck / warning must yield ("kubernetes", "TYPE-3")."""
        category, kind = classify_alert_early("DeployRolloutStuck", "warning")
        assert (category, kind) == ("kubernetes", "TYPE-3")

    def test_kube_node_not_ready(self):
        """KubeNodeNotReady / critical must yield ("kubernetes", "TYPE-3")."""
        category, kind = classify_alert_early("KubeNodeNotReady", "critical")
        assert (category, kind) == ("kubernetes", "TYPE-3")

    def test_pod_oom_killed(self):
        """PodOOMKilled / warning must yield ("kubernetes", "TYPE-3")."""
        category, kind = classify_alert_early("PodOOMKilled", "warning")
        assert (category, kind) == ("kubernetes", "TYPE-3")

    def test_node_memory_pressure(self):
        """NodeMemoryPressure / warning must yield ("kubernetes", "TYPE-3")."""
        category, kind = classify_alert_early("NodeMemoryPressure", "warning")
        assert (category, kind) == ("kubernetes", "TYPE-3")
class TestHostResource:
    """Golden cases whose expected category is ``host_resource`` (Host* alert names)."""

    def test_host_high_cpu_load(self):
        """HostHighCpuLoad / critical must yield ("host_resource", "TYPE-3")."""
        category, kind = classify_alert_early("HostHighCpuLoad", "critical")
        assert (category, kind) == ("host_resource", "TYPE-3")

    def test_host_high_memory(self):
        """HostHighMemory / warning must yield ("host_resource", "TYPE-3")."""
        category, kind = classify_alert_early("HostHighMemory", "warning")
        assert (category, kind) == ("host_resource", "TYPE-3")

    def test_host_disk_space_full(self):
        """HostDiskSpaceFull / critical must yield ("host_resource", "TYPE-3")."""
        category, kind = classify_alert_early("HostDiskSpaceFull", "critical")
        assert (category, kind) == ("host_resource", "TYPE-3")
class TestHighCpuVariants:
    """Coverage for the HighCPU* prefix rule; also includes a HighMemory* name."""

    def test_high_cpu_prefix(self):
        """HighCPUUsage / critical must yield ("host_resource", "TYPE-3")."""
        category, kind = classify_alert_early("HighCPUUsage", "critical")
        assert (category, kind) == ("host_resource", "TYPE-3")

    def test_high_memory_prefix(self):
        """HighMemoryPressure / warning must yield ("host_resource", "TYPE-3")."""
        category, kind = classify_alert_early("HighMemoryPressure", "warning")
        assert (category, kind) == ("host_resource", "TYPE-3")
class TestBackup:
    """Golden cases for backup alerts, including the effect of ``age_hours``."""

    def test_host_backup_failed_fresh(self):
        """Backup failure younger than 24h (age_hours=10.0) stays TYPE-1 (pure info)."""
        category, kind = classify_alert_early("HostBackupFailed", "warning", age_hours=10.0)
        assert (category, kind) == ("backup", "TYPE-1")

    def test_host_backup_failed_stale_upgrade(self):
        """Backup failure older than 24h (age_hours=25.0) is upgraded to TYPE-3 (P0 failure)."""
        category, kind = classify_alert_early("HostBackupFailed", "warning", age_hours=25.0)
        assert (category, kind) == ("backup_failure", "TYPE-3")

    def test_host_backup_stale_upgrade(self):
        """HostBackupStale at age_hours=30.0 must yield ("backup_failure", "TYPE-3")."""
        category, kind = classify_alert_early("HostBackupStale", "warning", age_hours=30.0)
        assert (category, kind) == ("backup_failure", "TYPE-3")

    def test_backup_restore_test_not_upgraded(self):
        """BackupRestoreTestFailed is not affected by the age-based upgrade."""
        # Only the notification type is pinned here; the category is deliberately not asserted.
        _, kind = classify_alert_early("BackupRestoreTestFailed", "warning", age_hours=48.0)
        assert kind == "TYPE-1"
class TestDatabase:
    """Golden cases whose expected category is ``database``."""

    def test_postgres_connections_high(self):
        """PostgreSQLConnectionsHigh / warning must yield ("database", "TYPE-3")."""
        category, kind = classify_alert_early("PostgreSQLConnectionsHigh", "warning")
        assert (category, kind) == ("database", "TYPE-3")

    def test_postgres_disk_growth_rate(self):
        """PostgreSQLDiskGrowthRate / warning must yield ("database", "TYPE-3")."""
        category, kind = classify_alert_early("PostgreSQLDiskGrowthRate", "warning")
        assert (category, kind) == ("database", "TYPE-3")

    def test_redis_memory_full(self):
        """RedisMemoryFull / critical must yield ("database", "TYPE-3")."""
        category, kind = classify_alert_early("RedisMemoryFull", "critical")
        assert (category, kind) == ("database", "TYPE-3")
class TestInfoAndHeartbeat:
    """Golden cases for low-severity alerts and heartbeat-style alert names."""

    def test_severity_info_is_type1(self):
        """Severity "info" must produce notification type TYPE-1 (category not pinned)."""
        _, kind = classify_alert_early("SomeAlert", "info")
        assert kind == "TYPE-1"

    def test_severity_none_is_type1(self):
        """Severity "none" must produce notification type TYPE-1 (category not pinned)."""
        _, kind = classify_alert_early("SomeAlert", "none")
        assert kind == "TYPE-1"

    def test_watchdog_heartbeat(self):
        """Watchdog / warning must yield ("backup", "TYPE-1")."""
        category, kind = classify_alert_early("Watchdog", "warning")
        assert (category, kind) == ("backup", "TYPE-1")

    def test_deadmansswitch(self):
        """DeadMansSwitch / critical must yield ("backup", "TYPE-1")."""
        category, kind = classify_alert_early("DeadMansSwitch", "critical")
        assert (category, kind) == ("backup", "TYPE-1")
class TestConfigDrift:
    """Golden cases whose expected category is ``config_drift`` (TYPE-4D)."""

    def test_configuration_drift(self):
        """ConfigurationDrift / warning must yield ("config_drift", "TYPE-4D")."""
        category, kind = classify_alert_early("ConfigurationDrift", "warning")
        assert (category, kind) == ("config_drift", "TYPE-4D")

    def test_kube_config_drift(self):
        """KubeConfigDrift / critical must yield ("config_drift", "TYPE-4D")."""
        category, kind = classify_alert_early("KubeConfigDrift", "critical")
        assert (category, kind) == ("config_drift", "TYPE-4D")
class TestFlywheelHealth:
    """Golden cases whose expected category is ``flywheel_health`` (TYPE-8M)."""

    def test_auto_repair_low_success(self):
        """AutoRepairLowSuccessRate / critical must yield ("flywheel_health", "TYPE-8M")."""
        category, kind = classify_alert_early("AutoRepairLowSuccessRate", "critical")
        assert (category, kind) == ("flywheel_health", "TYPE-8M")

    def test_ollama_down(self):
        """OllamaDown / critical must yield ("flywheel_health", "TYPE-8M")."""
        category, kind = classify_alert_early("OllamaDown", "critical")
        assert (category, kind) == ("flywheel_health", "TYPE-8M")
class TestSecops:
    """Golden cases whose expected category is ``secops`` (TYPE-5S)."""

    def test_unauthorized_ssh(self):
        """UnauthorizedSSH / critical must yield ("secops", "TYPE-5S")."""
        category, kind = classify_alert_early("UnauthorizedSSH", "critical")
        assert (category, kind) == ("secops", "TYPE-5S")

    def test_pod_abnormal_activity(self):
        """PodAbnormalActivity / critical must yield ("secops", "TYPE-5S")."""
        category, kind = classify_alert_early("PodAbnormalActivity", "critical")
        assert (category, kind) == ("secops", "TYPE-5S")
class TestDevopsAndExternal:
    """Golden cases for the ``devops_tool``, ``external_site`` and ``storage`` categories."""

    def test_gitea_down(self):
        """GiteaDown / critical must yield ("devops_tool", "TYPE-3")."""
        category, kind = classify_alert_early("GiteaDown", "critical")
        assert (category, kind) == ("devops_tool", "TYPE-3")

    def test_mowoo_work_down(self):
        """MoWoooWorkDown / critical must yield ("external_site", "TYPE-3")."""
        category, kind = classify_alert_early("MoWoooWorkDown", "critical")
        assert (category, kind) == ("external_site", "TYPE-3")

    def test_minio_down(self):
        """MinIODown / critical must yield ("storage", "TYPE-3")."""
        category, kind = classify_alert_early("MinIODown", "critical")
        assert (category, kind) == ("storage", "TYPE-3")
class TestAlertChainAndGeneral:
    """Golden cases for ``alertchain_health`` alerts and the ``general`` fallback."""

    def test_alert_chain_broken_alertmanager(self):
        """AlertChainBroken_Alertmanager / critical must yield ("alertchain_health", "TYPE-8M")."""
        category, kind = classify_alert_early("AlertChainBroken_Alertmanager", "critical")
        assert (category, kind) == ("alertchain_health", "TYPE-8M")

    def test_no_alerts_received(self):
        """NoAlertsReceived / critical must yield ("alertchain_health", "TYPE-8M")."""
        category, kind = classify_alert_early("NoAlertsReceived", "critical")
        assert (category, kind) == ("alertchain_health", "TYPE-8M")

    def test_unknown_alert_falls_to_general(self):
        """An alert name with no dedicated case in this suite must yield ("general", "TYPE-3")."""
        category, kind = classify_alert_early("SomeBrandNewUnknownAlert2026", "warning")
        assert (category, kind) == ("general", "TYPE-3")

View File

@@ -114,6 +114,9 @@ data:
# Root cause: SSH_MCP_ALLOWED_HOSTS was unset → _ssh_execute() blocked the call → every host alert was downgraded to manual approval
# Four hosts: 110 (DevOps vault/wooo), 120 (K3s-1/wooo), 121 (K3s-2/wooo), 188 (AI center/ollama)
SSH_MCP_ALLOWED_HOSTS: "192.168.0.110,192.168.0.120,192.168.0.121,192.168.0.188"
# 2026-04-28 Claude Sonnet 4.6: Wave 2 B3 — LLM 動態 Telegram 按鈕正式啟用
# B2 dispatch_llm_action + B3 keyboard + B4 la:{short_id} handler 全部就緒
USE_LLM_DYNAMIC_BUTTONS: "true"
# MCP Phase 3 (2026-04-11 Claude Sonnet 4.6): ArgoCD + Sentry MCP 啟用
# ARGOCD_API_TOKEN 在 Secrets 中配置
ARGOCD_MCP_ENABLED: "true"