feat(wave2+km): LLM 動態按鈕啟用 + KM 自動寫入 + AI Router dead code 標記
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m52s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m52s
- ConfigMap: USE_LLM_DYNAMIC_BUTTONS=true(B2/B3/B4 handler 全就緒)
- decision_manager: auto_execute 失敗路徑補 KM fire-and-forget 寫入
- ai_router: _build_fallback_chain 標記 DEPRECATED 2026-04-28
- tests: test_golden_regression.py 新增 172 行 golden 回歸測試

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -599,6 +599,7 @@ class AIRouter:
|
||||
self, selected_provider: AIProviderEnum
|
||||
) -> list[tuple[AIProviderEnum, str]]:
|
||||
"""
|
||||
DEPRECATED 2026-04-28 — 已由 _build_fallback_chain_for_intent 取代,無呼叫方
|
||||
建立 Fallback 鏈 (排除已選 Provider)
|
||||
|
||||
Fallback 順序: Ollama → Gemini → Claude
|
||||
|
||||
@@ -2098,6 +2098,11 @@ class DecisionManager:
|
||||
except Exception as _cd_err:
|
||||
logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err))
|
||||
|
||||
# P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: 失敗路徑 KM 寫入哨兵
|
||||
# 在 try 外宣告,確保 except 區塊能存取(即使建構失敗)
|
||||
_km_executor = None
|
||||
_km_approval = None
|
||||
|
||||
try:
|
||||
# 延遲導入避免循環依賴
|
||||
from src.models.approval import ApprovalRequest, ApprovalStatus
|
||||
@@ -2119,6 +2124,7 @@ class DecisionManager:
|
||||
risk_level=_risk,
|
||||
matched_playbook_id=_matched_playbook_id,
|
||||
)
|
||||
_km_approval = approval # P1.1: 供 except 失敗路徑使用
|
||||
|
||||
# ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6)
|
||||
_metrics_before = await _fetch_metrics_snapshot(incident)
|
||||
@@ -2144,6 +2150,7 @@ class DecisionManager:
|
||||
# _push_auto_repair_result → Telegram 顯示 ✅ 自動修復完成,即使 K8s 拒絕了指令
|
||||
# 修復: execute_approved_action 現在返回 bool,正確透傳給通知函數
|
||||
executor = ApprovalExecutionService()
|
||||
_km_executor = executor # P1.1: 供 except 失敗路徑使用
|
||||
_exec_success = await executor.execute_approved_action(approval)
|
||||
|
||||
# 更新狀態
|
||||
@@ -2190,6 +2197,16 @@ class DecisionManager:
|
||||
_push_decision_to_telegram(incident, token.proposal_data)
|
||||
)
|
||||
|
||||
# P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: 失敗路徑補 KM 寫入
|
||||
# 根因:auto_execute 拋出例外時,學習飛輪完全拿不到失敗記錄
|
||||
# 修法:若 executor/approval 已建立,fire-and-forget 寫入失敗 KM;未建立則跳過
|
||||
if _km_executor is not None and _km_approval is not None:
|
||||
_fire_and_forget(
|
||||
_km_executor._write_execution_result_to_km(
|
||||
_km_approval, False, str(e)
|
||||
)
|
||||
)
|
||||
|
||||
async def _query_kb_context_inner(self, incident: Incident) -> str:
|
||||
"""KB RAG 實際查詢邏輯,由 _query_kb_context 包裝 timeout 後呼叫"""
|
||||
query_parts = list(incident.affected_services)
|
||||
|
||||
172
apps/api/tests/test_golden_regression.py
Normal file
172
apps/api/tests/test_golden_regression.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# apps/api/tests/test_golden_regression.py
|
||||
# 2026-04-28 ogt + Claude Sonnet 4.6 — P3.2.4 黃金回歸測試集
|
||||
"""
|
||||
33 個黃金測試案例,保護 classify_alert_early() 邏輯不退步。
|
||||
涵蓋 11 組告警場景(對應下方 11 個測試類別)。
|
||||
|
||||
函數簽章:
|
||||
classify_alert_early(alertname, severity, labels=None, age_hours=0.0)
|
||||
-> tuple[str, str] # (alert_category, notification_type)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from src.services.incident_service import classify_alert_early
|
||||
|
||||
|
||||
class TestKubernetes:
    """Golden cases: Kubernetes alert names must classify as ("kubernetes", "TYPE-3")."""

    def test_pod_crash_looping(self):
        category, notification_type = classify_alert_early("PodCrashLooping", "critical")
        assert (category, notification_type) == ("kubernetes", "TYPE-3")

    def test_deploy_rollout_stuck(self):
        category, notification_type = classify_alert_early("DeployRolloutStuck", "warning")
        assert (category, notification_type) == ("kubernetes", "TYPE-3")

    def test_kube_node_not_ready(self):
        category, notification_type = classify_alert_early("KubeNodeNotReady", "critical")
        assert (category, notification_type) == ("kubernetes", "TYPE-3")

    def test_pod_oom_killed(self):
        category, notification_type = classify_alert_early("PodOOMKilled", "warning")
        assert (category, notification_type) == ("kubernetes", "TYPE-3")

    def test_node_memory_pressure(self):
        category, notification_type = classify_alert_early("NodeMemoryPressure", "warning")
        assert (category, notification_type) == ("kubernetes", "TYPE-3")
|
||||
|
||||
|
||||
class TestHostResource:
    """Golden cases: Host* resource alerts must classify as ("host_resource", "TYPE-3")."""

    def test_host_high_cpu_load(self):
        category, notification_type = classify_alert_early("HostHighCpuLoad", "critical")
        assert (category, notification_type) == ("host_resource", "TYPE-3")

    def test_host_high_memory(self):
        category, notification_type = classify_alert_early("HostHighMemory", "warning")
        assert (category, notification_type) == ("host_resource", "TYPE-3")

    def test_host_disk_space_full(self):
        category, notification_type = classify_alert_early("HostDiskSpaceFull", "critical")
        assert (category, notification_type) == ("host_resource", "TYPE-3")
|
||||
|
||||
|
||||
class TestHighCpuVariants:
    """Coverage for the HighCPU* prefix rule (plus a HighMemory* variant)."""

    def test_high_cpu_prefix(self):
        category, notification_type = classify_alert_early("HighCPUUsage", "critical")
        assert (category, notification_type) == ("host_resource", "TYPE-3")

    def test_high_memory_prefix(self):
        category, notification_type = classify_alert_early("HighMemoryPressure", "warning")
        assert (category, notification_type) == ("host_resource", "TYPE-3")
|
||||
|
||||
|
||||
class TestBackup:
    """Golden cases for backup alerts, including the age-based upgrade to TYPE-3."""

    def test_host_backup_failed_fresh(self):
        """Backup failure younger than 24h -> TYPE-1 (pure info)."""
        category, notification_type = classify_alert_early(
            "HostBackupFailed", "warning", age_hours=10.0
        )
        assert (category, notification_type) == ("backup", "TYPE-1")

    def test_host_backup_failed_stale_upgrade(self):
        """Backup failure older than 24h -> upgraded to TYPE-3 (P0 failure)."""
        category, notification_type = classify_alert_early(
            "HostBackupFailed", "warning", age_hours=25.0
        )
        assert (category, notification_type) == ("backup_failure", "TYPE-3")

    def test_host_backup_stale_upgrade(self):
        category, notification_type = classify_alert_early(
            "HostBackupStale", "warning", age_hours=30.0
        )
        assert (category, notification_type) == ("backup_failure", "TYPE-3")

    def test_backup_restore_test_not_upgraded(self):
        """BackupRestoreTestFailed is not affected by the age-based upgrade."""
        # Only the notification type is pinned here; the category is not asserted.
        _, notification_type = classify_alert_early(
            "BackupRestoreTestFailed", "warning", age_hours=48.0
        )
        assert notification_type == "TYPE-1"
|
||||
|
||||
|
||||
class TestDatabase:
    """Golden cases: PostgreSQL/Redis alerts must classify as ("database", "TYPE-3")."""

    def test_postgres_connections_high(self):
        category, notification_type = classify_alert_early("PostgreSQLConnectionsHigh", "warning")
        assert (category, notification_type) == ("database", "TYPE-3")

    def test_postgres_disk_growth_rate(self):
        category, notification_type = classify_alert_early("PostgreSQLDiskGrowthRate", "warning")
        assert (category, notification_type) == ("database", "TYPE-3")

    def test_redis_memory_full(self):
        category, notification_type = classify_alert_early("RedisMemoryFull", "critical")
        assert (category, notification_type) == ("database", "TYPE-3")
|
||||
|
||||
|
||||
class TestInfoAndHeartbeat:
    """Golden cases: info/none severities and heartbeat alerts must yield TYPE-1."""

    def test_severity_info_is_type1(self):
        # Category is intentionally not pinned for severity-driven cases.
        _, notification_type = classify_alert_early("SomeAlert", "info")
        assert notification_type == "TYPE-1"

    def test_severity_none_is_type1(self):
        _, notification_type = classify_alert_early("SomeAlert", "none")
        assert notification_type == "TYPE-1"

    def test_watchdog_heartbeat(self):
        category, notification_type = classify_alert_early("Watchdog", "warning")
        assert (category, notification_type) == ("backup", "TYPE-1")

    def test_deadmansswitch(self):
        category, notification_type = classify_alert_early("DeadMansSwitch", "critical")
        assert (category, notification_type) == ("backup", "TYPE-1")
|
||||
|
||||
|
||||
class TestConfigDrift:
    """Golden cases: drift alerts must classify as ("config_drift", "TYPE-4D")."""

    def test_configuration_drift(self):
        category, notification_type = classify_alert_early("ConfigurationDrift", "warning")
        assert (category, notification_type) == ("config_drift", "TYPE-4D")

    def test_kube_config_drift(self):
        category, notification_type = classify_alert_early("KubeConfigDrift", "critical")
        assert (category, notification_type) == ("config_drift", "TYPE-4D")
|
||||
|
||||
|
||||
class TestFlywheelHealth:
    """Golden cases: flywheel alerts must classify as ("flywheel_health", "TYPE-8M")."""

    def test_auto_repair_low_success(self):
        category, notification_type = classify_alert_early("AutoRepairLowSuccessRate", "critical")
        assert (category, notification_type) == ("flywheel_health", "TYPE-8M")

    def test_ollama_down(self):
        category, notification_type = classify_alert_early("OllamaDown", "critical")
        assert (category, notification_type) == ("flywheel_health", "TYPE-8M")
|
||||
|
||||
|
||||
class TestSecops:
    """Golden cases: security alerts must classify as ("secops", "TYPE-5S")."""

    def test_unauthorized_ssh(self):
        category, notification_type = classify_alert_early("UnauthorizedSSH", "critical")
        assert (category, notification_type) == ("secops", "TYPE-5S")

    def test_pod_abnormal_activity(self):
        category, notification_type = classify_alert_early("PodAbnormalActivity", "critical")
        assert (category, notification_type) == ("secops", "TYPE-5S")
|
||||
|
||||
|
||||
class TestDevopsAndExternal:
    """Golden cases for DevOps tooling, external site and storage "down" alerts (all TYPE-3)."""

    def test_gitea_down(self):
        category, notification_type = classify_alert_early("GiteaDown", "critical")
        assert (category, notification_type) == ("devops_tool", "TYPE-3")

    def test_mowoo_work_down(self):
        category, notification_type = classify_alert_early("MoWoooWorkDown", "critical")
        assert (category, notification_type) == ("external_site", "TYPE-3")

    def test_minio_down(self):
        category, notification_type = classify_alert_early("MinIODown", "critical")
        assert (category, notification_type) == ("storage", "TYPE-3")
|
||||
|
||||
|
||||
class TestAlertChainAndGeneral:
    """Golden cases for alert-chain health alerts and the fallback for unknown names."""

    def test_alert_chain_broken_alertmanager(self):
        category, notification_type = classify_alert_early(
            "AlertChainBroken_Alertmanager", "critical"
        )
        assert (category, notification_type) == ("alertchain_health", "TYPE-8M")

    def test_no_alerts_received(self):
        category, notification_type = classify_alert_early("NoAlertsReceived", "critical")
        assert (category, notification_type) == ("alertchain_health", "TYPE-8M")

    def test_unknown_alert_falls_to_general(self):
        # An alert name matching no rule is expected to land in the "general" bucket.
        category, notification_type = classify_alert_early(
            "SomeBrandNewUnknownAlert2026", "warning"
        )
        assert (category, notification_type) == ("general", "TYPE-3")
|
||||
@@ -114,6 +114,9 @@ data:
|
||||
# 根因:SSH_MCP_ALLOWED_HOSTS 未設定 → _ssh_execute() 攔截 → 主機告警全部降級人工審核
|
||||
# 四台主機:110(DevOps金庫/wooo), 120(K3s-1/wooo), 121(K3s-2/wooo), 188(AI中心/ollama)
|
||||
SSH_MCP_ALLOWED_HOSTS: "192.168.0.110,192.168.0.120,192.168.0.121,192.168.0.188"
|
||||
# 2026-04-28 Claude Sonnet 4.6: Wave 2 B3 — LLM 動態 Telegram 按鈕正式啟用
|
||||
# B2 dispatch_llm_action + B3 keyboard + B4 la:{short_id} handler 全部就緒
|
||||
USE_LLM_DYNAMIC_BUTTONS: "true"
|
||||
# MCP Phase 3 (2026-04-11 Claude Sonnet 4.6): ArgoCD + Sentry MCP 啟用
|
||||
# ARGOCD_API_TOKEN 在 Secrets 中配置
|
||||
ARGOCD_MCP_ENABLED: "true"
|
||||
|
||||
Reference in New Issue
Block a user